/*
 * mulhilo<W> templates and their instantiations.
 *
 * Every _mulhilo_..._tpl macro in this span expands to a function with the
 * signature
 *     Word mulhilo<W>(Word a, Word b, Word *hip)
 * (the asm variants name the first parameter ax).  What the visible text
 * shows for each variant:
 *   - dword_tpl:       multiplies in the wider type Dword and returns the
 *                      product truncated to Word.
 *   - asm_tpl:         defined twice; one form emits INSN with three operands,
 *                      the other uses the "=a"(ax), "=d"(dx) output
 *                      constraints.  The directive choosing between the two
 *                      definitions is not visible here.
 *   - msvc_intrin_tpl: returns INTRIN(a, b, hip).
 *   - cuda_intrin_tpl: stores INTRIN(a, b) through hip.
 *   - c99_tpl:         splits a and b into W/2-bit halves (WHALF, LOMASK),
 *                      forms the cross products ahbl and albh, and accumulates
 *                      hi from ahi*bhi, the upper halves of the cross
 *                      products, and two carry terms.
 *   - fail_tpl:        expands to R123_STATIC_ASSERT(0, ...) so that using it
 *                      is a compile-time error.
 *
 * The instantiations that follow pick a variant for W=32 and, inside
 * R123_USE_PHILOX_64BIT, for W=64 according to the R123_USE_MULHILO64_...
 * feature macros.
 *
 * NOTE(review): this text carries embedded numeric markers (32, 33, 67, ...)
 * and the numbering has gaps (for example 69 to 71, 140 to 142, 154 to 164).
 * Statements such as the declaration of lo used in c99_tpl, the asm bodies,
 * closing braces, and several #else / #endif directives are not visible, and
 * the branch after R123_USE_MULHILO64_MULHI_INTRIN has no visible body.
 * Presumably they were lost in extraction; restore them from the pristine
 * header before compiling.  The code below is intentionally left unchanged.
 */
32 #ifndef _philox_dot_h_ 33 #define _philox_dot_h_ 67 #define _mulhilo_dword_tpl(W, Word, Dword) \ 68 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ 69 Dword product = ((Dword)a)*((Dword)b); \ 71 return (Word)product; \ 81 #define _mulhilo_asm_tpl(W, Word, INSN) \ 82 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ 85 INSN " %0,%1,%2\n\t" \ 93 #define _mulhilo_asm_tpl(W, Word, INSN) \ 94 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ 98 : "=a"(ax), "=d"(dx) \ 111 #define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \ 112 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ 113 return INTRIN(a, b, hip); \ 118 #define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \ 119 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ 120 *hip = INTRIN(a, b); \ 137 #define _mulhilo_c99_tpl(W, Word) \ 138 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ 139 const unsigned WHALF = W/2; \ 140 const Word LOMASK = ((((Word)1)<<WHALF)-1); \ 142 Word ahi = a>>WHALF; \ 143 Word alo = a& LOMASK; \ 144 Word bhi = b>>WHALF; \ 145 Word blo = b& LOMASK; \ 147 Word ahbl = ahi*blo; \ 148 Word albh = alo*bhi; \ 150 Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \ 151 Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \ 152 hi += ahbl_albh >> WHALF; \ 154 hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \ 164 #define _mulhilo_fail_tpl(W, Word) \ 165 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ 166 R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \ 174 #if R123_USE_MULHILO32_ASM 176 _mulhilo_asm_tpl(32, uint32_t,
"mulhwu")
178 _mulhilo_asm_tpl(32, uint32_t,
"mull")
181 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
184 #if R123_USE_PHILOX_64BIT 185 #if R123_USE_MULHILO64_ASM 187 _mulhilo_asm_tpl(64, uint64_t,
"mulhdu")
189 _mulhilo_asm_tpl(64, uint64_t,
"mulq")
191 #elif R123_USE_MULHILO64_MSVC_INTRIN 192 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
193 #elif R123_USE_MULHILO64_CUDA_INTRIN 194 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
195 #elif R123_USE_MULHILO64_OPENCL_INTRIN 196 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
197 #elif R123_USE_MULHILO64_MULHI_INTRIN 199 #elif R123_USE_GNU_UINT128 200 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
201 #elif R123_USE_MULHILO64_C99 202 _mulhilo_c99_tpl(64, uint64_t)
204 _mulhilo_fail_tpl(64, uint64_t)
/*
 * Philox constants, per-round templates, and the C entry points.
 *
 * Constants:
 *   - PHILOX_M<N>x<W>_<i>: multipliers passed as the first argument of
 *     mulhilo<W> in the round functions; each is guarded by #ifndef so it can
 *     be pre-defined by the includer.
 *   - PHILOX_W<W>_0 and PHILOX_W<W>_1: values added to key.v[0] and key.v[1]
 *     by the bumpkey functions.
 *   - PHILOX<N>x<W>_DEFAULT_ROUNDS: 10 for every variant unless pre-defined.
 *
 * Templates:
 *   - _philox2xWround_tpl: lo = mulhilo<W>(PHILOX_M2x<W>_0, ctr.v[0], &hi);
 *     the result is { hi ^ key.v[0] ^ ctr.v[1], lo }.
 *   - _philox4xWround_tpl: two mulhilo<W> calls on ctr.v[0] and ctr.v[2]; the
 *     result is { hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0 }.
 *   - _philox2xWbumpkey_tpl and _philox4xWbumpkey_tpl: add the PHILOX_W
 *     constants to the key words.
 *   - _philoxNxW_tpl: defines the philox<N>x<W>_rounds enum constant, the
 *     _ctr_t, _key_t and _ukey_t typedefs, keyinit (returns uk unchanged),
 *     and philox<N>x<W>_R(R, ctr, key).  _R asserts R <= 16 and applies up
 *     to 16 rounds through an unrolled chain of if(R>k) tests; the key is
 *     bumped before every round except the first.
 *
 * The invocations at the end instantiate the templates for W=32 and, inside
 * R123_USE_PHILOX_64BIT, for W=64.
 *
 * NOTE(review): embedded numeric markers show gaps (for example 273 to 275,
 * 276 to 279, 329 to 333, 336 to 339).  The declarations of hi, hi0 and hi1,
 * the return statements, closing braces and #endif lines are not visible,
 * and marker 338, where a _philoxNxW_tpl(2, 1, 32, uint32_t) instantiation
 * would be expected, is absent at this position.  Verify against the
 * pristine header.  The code below is intentionally left unchanged.
 */
216 #ifndef PHILOX_M2x64_0 217 #define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93) 220 #ifndef PHILOX_M4x64_0 221 #define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93) 224 #ifndef PHILOX_M4x64_1 225 #define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157) 228 #ifndef PHILOX_M2x32_0 229 #define PHILOX_M2x32_0 ((uint32_t)0xd256d193) 232 #ifndef PHILOX_M4x32_0 233 #define PHILOX_M4x32_0 ((uint32_t)0xD2511F53) 235 #ifndef PHILOX_M4x32_1 236 #define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57) 240 #define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) 243 #define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) 247 #define PHILOX_W32_0 ((uint32_t)0x9E3779B9) 250 #define PHILOX_W32_1 ((uint32_t)0xBB67AE85) 253 #ifndef PHILOX2x32_DEFAULT_ROUNDS 254 #define PHILOX2x32_DEFAULT_ROUNDS 10 257 #ifndef PHILOX2x64_DEFAULT_ROUNDS 258 #define PHILOX2x64_DEFAULT_ROUNDS 10 261 #ifndef PHILOX4x32_DEFAULT_ROUNDS 262 #define PHILOX4x32_DEFAULT_ROUNDS 10 265 #ifndef PHILOX4x64_DEFAULT_ROUNDS 266 #define PHILOX4x64_DEFAULT_ROUNDS 10 271 #define _philox2xWround_tpl(W, T) \ 272 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \ 273 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \ 275 T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \ 276 struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \ 279 #define _philox2xWbumpkey_tpl(W) \ 280 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \ 281 key.v[0] += PHILOX_W##W##_0; \ 285 #define _philox4xWround_tpl(W, T) \ 286 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \ 287 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \ 
290 T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \ 291 T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \ 292 struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \ 293 hi0^ctr.v[3]^key.v[1], lo0}}; \ 297 #define _philox4xWbumpkey_tpl(W) \ 298 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \ 299 key.v[0] += PHILOX_W##W##_0; \ 300 key.v[1] += PHILOX_W##W##_1; \ 304 #define _philoxNxW_tpl(N, Nhalf, W, T) \ 306 enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \ 307 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \ 308 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \ 309 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \ 310 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \ 311 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \ 312 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \ 313 R123_ASSERT(R<=16); \ 314 if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \ 315 if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 316 if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 317 if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 318 if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 319 if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 320 if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 321 if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = 
_philox##N##x##W##round(ctr, key); } \ 322 if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 323 if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 324 if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 325 if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 326 if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 327 if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 328 if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 329 if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ 333 _philox2xWbumpkey_tpl(32)
334 _philox4xWbumpkey_tpl(32)
335 _philox2xWround_tpl(32, uint32_t)
336 _philox4xWround_tpl(32, uint32_t)
339 _philoxNxW_tpl(4, 2, 32, uint32_t)
340 #if R123_USE_PHILOX_64BIT 342 _philox2xWbumpkey_tpl(64)
343 _philox4xWbumpkey_tpl(64)
344 _philox2xWround_tpl(64, uint64_t)
345 _philox4xWround_tpl(64, uint64_t)
347 _philoxNxW_tpl(2, 1, 64, uint64_t)
348 _philoxNxW_tpl(4, 2, 64, uint64_t)
/*
 * Convenience wrappers around philox<N>x<W>_R.
 *
 * C macros: philox2x32(c,k), philox4x32(c,k) and, inside
 * R123_USE_PHILOX_64BIT, philox2x64(c,k) and philox4x64(c,k) call the
 * matching _R function with the philox<N>x<W>_rounds constant as the round
 * count.
 *
 * C++ template: _PhiloxNxW_base_tpl(CType, KType, N, W) defines
 *     template<unsigned int ROUNDS> struct Philox<N>x<W>_R
 * with typedefs ctr_type (CType), key_type and ukey_type (both KType), a
 * static constant rounds equal to ROUNDS, and an operator()(ctr, key) that
 * static-asserts ROUNDS <= 16 and returns philox<N>x<W>_R(ROUNDS, ctr, key).
 * It also typedefs Philox<N>x<W> as the instantiation using
 * philox<N>x<W>_rounds.  The template is instantiated for 2x32 and 4x32 and,
 * inside R123_USE_PHILOX_64BIT, for 2x64 and 4x64.
 *
 * NOTE(review): embedded numeric markers show gaps (for example 355 to 363,
 * 373 to 376, 376 to 380).  Closing braces, #endif lines and any enclosing
 * C++-only guard or namespace are not visible; presumably they were lost in
 * extraction.  Verify against the pristine header.  The code below is
 * intentionally left unchanged.
 */
351 #define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k) 352 #define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k) 353 #if R123_USE_PHILOX_64BIT 354 #define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k) 355 #define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k) 363 #define _PhiloxNxW_base_tpl(CType, KType, N, W) \ 365 template<unsigned int ROUNDS> \ 366 struct Philox##N##x##W##_R{ \ 367 typedef CType ctr_type; \ 368 typedef KType key_type; \ 369 typedef KType ukey_type; \ 370 static const unsigned int rounds=ROUNDS; \ 371 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \ 372 R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \ 373 return philox##N##x##W##_R(ROUNDS, ctr, key); \ 376 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \ 380 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32)
381 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32)
382 #if R123_USE_PHILOX_64BIT 383 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64)
384 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64)
#define R123_MULHILO64_MULHI_INTRIN
_philoxNxW_tpl(2, 1, 32, uint32_t) _philoxNxW_tpl(4