GeNN  4.9.0
GPU enhanced Neuronal Networks (GeNN)
philox.h
Go to the documentation of this file.
1 /*
2 Copyright 2010-2011, D. E. Shaw Research.
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 
9 * Redistributions of source code must retain the above copyright
10  notice, this list of conditions, and the following disclaimer.
11 
12 * Redistributions in binary form must reproduce the above copyright
13  notice, this list of conditions, and the following disclaimer in the
14  documentation and/or other materials provided with the distribution.
15 
16 * Neither the name of D. E. Shaw Research nor the names of its
17  contributors may be used to endorse or promote products derived from
18  this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #ifndef _philox_dot_h_
33 #define _philox_dot_h_
34 
38 #include "array.h"
39 
40 
41 /*
42 // Macros _Foo_tpl are code generation 'templates' They define
43 // inline functions with names obtained by mangling Foo and the
44 // macro arguments. E.g.,
45 // _mulhilo_tpl(32, uint32_t, uint64_t)
46 // expands to a definition of:
47 // mulhilo32(uint32_t, uint32_t, uint32_t *, uint32_t *)
48 // We then 'instantiate the template' to define
49 // several different functions, e.g.,
50 // mulhilo32
51 // mulhilo64
52 // These functions will be visible to user code, and may
53 // also be used later in subsequent templates and definitions.
54 
55 // A template for mulhilo using a temporary of twice the word-width.
56 // Gcc figures out that this can be reduced to a single 'mul' instruction,
57 // despite the apparent use of double-wide variables, shifts, etc. It's
58 // obviously not guaranteed that all compilers will be that smart, so
59 // other implementations might be preferable, e.g., using an intrinsic
60 // or an asm block. On the other hand, for 32-bit multiplies,
61 // this *is* perfectly standard C99 - any C99 compiler should
62 // understand it and produce correct code. For 64-bit multiplies,
63 // it's only usable if the compiler recognizes that it can do
64 // arithmetic on a 128-bit type. That happens to be true for gcc on
65 // x86-64, and powerpc64 but not much else.
66 */
67 #define _mulhilo_dword_tpl(W, Word, Dword) \
68 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
69  Dword product = ((Dword)a)*((Dword)b); \
70  *hip = product>>W; \
71  return (Word)product; \
72 }
73 
74 /*
75 // A template for mulhilo using gnu-style asm syntax.
76 // INSN can be "mulw", "mull" or "mulq".
77 // FIXME - porting to other architectures, we'll need still-more conditional
78 // branching here. Note that intrinsics are usually preferable.
79 */
80 #ifdef __powerpc__
81 #define _mulhilo_asm_tpl(W, Word, INSN) \
82 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
83  Word dx = 0; \
84  __asm__("\n\t" \
85  INSN " %0,%1,%2\n\t" \
86  : "=r"(dx) \
87  : "r"(b), "r"(ax) \
88  ); \
89  *hip = dx; \
90  return ax*b; \
91 }
92 #else
93 #define _mulhilo_asm_tpl(W, Word, INSN) \
94 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \
95  Word dx; \
96  __asm__("\n\t" \
97  INSN " %2\n\t" \
98  : "=a"(ax), "=d"(dx) \
99  : "r"(b), "0"(ax) \
100  ); \
101  *hip = dx; \
102  return ax; \
103 }
104 #endif /* __powerpc__ */
105 
106 /*
107 // A template for mulhilo using MSVC-style intrinsics
108 // For example,_umul128 is an msvc intrinsic, c.f.
109 // http://msdn.microsoft.com/en-us/library/3dayytw9.aspx
110 */
111 #define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
112 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
113  return INTRIN(a, b, hip); \
114 }
115 
116 /* N.B. This really should be called _mulhilo_mulhi_intrin. It just
117  happens that CUDA was the first time we used the idiom. */
118 #define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
119 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
120  *hip = INTRIN(a, b); \
121  return a*b; \
122 }
123 
124 /*
125 // A template for mulhilo using only word-size operations and
126 // C99 operators (no adc, no mulhi). It
127 // requires four multiplies and a dozen or so shifts, adds
128 // and tests. It's not clear what this is good for, other than
129 // completeness. On 32-bit platforms, it could be used to
130 // implement philoxNx64, but on such platforms both the philoxNx32
131 // and the threefryNx64 cbrngs are going to have much better
132 // performance. It is enabled below by R123_USE_MULHILO64_C99,
133 // but that is currently (Sep 2011) not set by any of the
134 // features/XXfeatures.h headers. It can, of course, be
135 // set with a compile-time -D option.
136 */
137 #define _mulhilo_c99_tpl(W, Word) \
138 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
139  const unsigned WHALF = W/2; \
140  const Word LOMASK = ((((Word)1)<<WHALF)-1); \
141  Word lo = a*b; /* full low multiply */ \
142  Word ahi = a>>WHALF; \
143  Word alo = a& LOMASK; \
144  Word bhi = b>>WHALF; \
145  Word blo = b& LOMASK; \
146  \
147  Word ahbl = ahi*blo; \
148  Word albh = alo*bhi; \
149  \
150  Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \
151  Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \
152  hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \
153  /* carry from the sum with alo*blo */ \
154  hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \
155  *hip = hi; \
156  return lo; \
157 }
158 
159 /*
160 // A template for mulhilo on a platform that can't do it
161 // We could put a C version here, but is it better to run *VERY*
162 // slowly or to just stop and force the user to find another CBRNG?
163 */
164 #define _mulhilo_fail_tpl(W, Word) \
165 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
166  R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
167 }
168 
169 /*
170 // N.B. There's an MSVC intrinsic called _emul,
171 // which *might* compile into better code than
172 // _mulhilo_dword_tpl
173 */
174 #if R123_USE_MULHILO32_ASM
175 #ifdef __powerpc__
176 _mulhilo_asm_tpl(32, uint32_t, "mulhwu")
177 #else
178 _mulhilo_asm_tpl(32, uint32_t, "mull")
179 #endif /* __powerpc__ */
180 #else
181 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
182 #endif
183 
184 #if R123_USE_PHILOX_64BIT
185 #if R123_USE_MULHILO64_ASM
186 #ifdef __powerpc64__
187 _mulhilo_asm_tpl(64, uint64_t, "mulhdu")
188 #else
189 _mulhilo_asm_tpl(64, uint64_t, "mulq")
190 #endif /* __powerpc64__ */
191 #elif R123_USE_MULHILO64_MSVC_INTRIN
192 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
193 #elif R123_USE_MULHILO64_CUDA_INTRIN
194 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
195 #elif R123_USE_MULHILO64_OPENCL_INTRIN
196 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
197 #elif R123_USE_MULHILO64_MULHI_INTRIN
198 _mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
199 #elif R123_USE_GNU_UINT128
200 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
201 #elif R123_USE_MULHILO64_C99
202 _mulhilo_c99_tpl(64, uint64_t)
203 #else
204 _mulhilo_fail_tpl(64, uint64_t)
205 #endif
206 #endif
207 
208 /*
209 // The multipliers and Weyl constants are "hard coded".
210 // To change them, you can #define them with different
211 // values before #include-ing this file.
212 // This isn't terribly elegant, but it works for C as
213 // well as C++. A nice C++-only solution would be to
214 // use template parameters in the style of <random>
215 */
216 #ifndef PHILOX_M2x64_0
217 #define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
218 #endif
219 
220 #ifndef PHILOX_M4x64_0
221 #define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
222 #endif
223 
224 #ifndef PHILOX_M4x64_1
225 #define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
226 #endif
227 
228 #ifndef PHILOX_M2x32_0
229 #define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
230 #endif
231 
232 #ifndef PHILOX_M4x32_0
233 #define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
234 #endif
235 #ifndef PHILOX_M4x32_1
236 #define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
237 #endif
238 
239 #ifndef PHILOX_W64_0
240 #define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */
241 #endif
242 #ifndef PHILOX_W64_1
243 #define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */
244 #endif
245 
246 #ifndef PHILOX_W32_0
247 #define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
248 #endif
249 #ifndef PHILOX_W32_1
250 #define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
251 #endif
252 
253 #ifndef PHILOX2x32_DEFAULT_ROUNDS
254 #define PHILOX2x32_DEFAULT_ROUNDS 10
255 #endif
256 
257 #ifndef PHILOX2x64_DEFAULT_ROUNDS
258 #define PHILOX2x64_DEFAULT_ROUNDS 10
259 #endif
260 
261 #ifndef PHILOX4x32_DEFAULT_ROUNDS
262 #define PHILOX4x32_DEFAULT_ROUNDS 10
263 #endif
264 
265 #ifndef PHILOX4x64_DEFAULT_ROUNDS
266 #define PHILOX4x64_DEFAULT_ROUNDS 10
267 #endif
268 
269 /* The ignored fourth argument allows us to instantiate the
270  same macro regardless of N. */
271 #define _philox2xWround_tpl(W, T) \
272 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
273 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
274  T hi; \
275  T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
276  struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \
277  return out; \
278 }
279 #define _philox2xWbumpkey_tpl(W) \
280 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
281  key.v[0] += PHILOX_W##W##_0; \
282  return key; \
283 }
284 
285 #define _philox4xWround_tpl(W, T) \
286 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
287 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
288  T hi0; \
289  T hi1; \
290  T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
291  T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
292  struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \
293  hi0^ctr.v[3]^key.v[1], lo0}}; \
294  return out; \
295 }
296 
297 #define _philox4xWbumpkey_tpl(W) \
298 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
299  key.v[0] += PHILOX_W##W##_0; \
300  key.v[1] += PHILOX_W##W##_1; \
301  return key; \
302 }
303 
304 #define _philoxNxW_tpl(N, Nhalf, W, T) \
305  \
306 enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
307 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
308 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
309 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
310 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
311 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
312 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
313  R123_ASSERT(R<=16); \
314  if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \
315  if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
316  if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
317  if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
318  if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
319  if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
320  if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
321  if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
322  if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
323  if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
324  if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
325  if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
326  if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
327  if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
328  if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
329  if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
330  return ctr; \
331 }
332 
333 _philox2xWbumpkey_tpl(32)
334 _philox4xWbumpkey_tpl(32)
335 _philox2xWround_tpl(32, uint32_t) /* philo2x32round */
336 _philox4xWround_tpl(32, uint32_t) /* philo4x32round */
338 _philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */
339 _philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */
340 #if R123_USE_PHILOX_64BIT
341 
342 _philox2xWbumpkey_tpl(64)
343 _philox4xWbumpkey_tpl(64)
344 _philox2xWround_tpl(64, uint64_t) /* philo2x64round */
345 _philox4xWround_tpl(64, uint64_t) /* philo4x64round */
347 _philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */
348 _philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */
349 #endif /* R123_USE_PHILOX_64BIT */
350 
351 #define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
352 #define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
353 #if R123_USE_PHILOX_64BIT
354 #define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
355 #define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
356 #endif /* R123_USE_PHILOX_64BIT */
357 
358 #ifdef __cplusplus
359 #include <stdexcept>
360 
363 #define _PhiloxNxW_base_tpl(CType, KType, N, W) \
364 namespace r123{ \
365 template<unsigned int ROUNDS> \
366 struct Philox##N##x##W##_R{ \
367  typedef CType ctr_type; \
368  typedef KType key_type; \
369  typedef KType ukey_type; \
370  static const unsigned int rounds=ROUNDS; \
371  inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
372  R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
373  return philox##N##x##W##_R(ROUNDS, ctr, key); \
374  } \
375 }; \
376 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
377  } // namespace r123
378 
380 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R<R>
381 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R<R>
382 #if R123_USE_PHILOX_64BIT
383 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R<R>
384 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R<R>
385 #endif
386 
387 /* The _tpl macros don't quite work to do string-pasting inside comments.
388  so we just write out the boilerplate documentation four times... */
389 
484 #endif /* __cplusplus */
485 
486 #endif /* _philox_dot_h_ */
#define R123_MULHILO64_MULHI_INTRIN
_philoxNxW_tpl(2, 1, 32, uint32_t) _philoxNxW_tpl(4