GeNN  4.9.0
GPU enhanced Neuronal Networks (GeNN)
cuda/backend.h
Go to the documentation of this file.
1 #pragma once
2 
3 // Standard C++ includes
4 #include <algorithm>
5 #include <array>
6 #include <functional>
7 #include <map>
8 #include <numeric>
9 #include <string>
10 
11 // Standard C includes
12 #include <cassert>
13 
14 // CUDA includes
15 #include <cuda.h>
16 #include <cuda_runtime.h>
17 
18 // GeNN includes
19 #include "backendExport.h"
20 
21 // GeNN code generator includes
25 
26 // Forward declarations
namespace filesystem
{
    // Forward declaration so this header need not pull in the full
    // filesystem library — only pointers/references to path are used here
    class path;
}
31 
32 //--------------------------------------------------------------------------
33 // CodeGenerator::CUDA::DeviceSelectMethod
34 //--------------------------------------------------------------------------
35 namespace CodeGenerator
36 {
37 namespace CUDA
38 {
//! Methods for selecting CUDA device
enum class DeviceSelect
{
    OPTIMAL,            //!< Pick optimal device based on how well kernels can be simultaneously simulated and occupancy
    MOST_MEMORY,        //!< Pick device with most global memory
    MANUAL,             //!< Use device specified by user
    MANUAL_OVERRIDE,    //!< Use device specified by user at runtime with allocateMem parameter (referenced by Preferences::updateHash)
};
47 
48 //--------------------------------------------------------------------------
49 // CodeGenerator::CUDA::BlockSizeSelect
50 //--------------------------------------------------------------------------
//! Methods for selecting CUDA kernel block size
enum class BlockSizeSelect
{
    OCCUPANCY,  //!< Pick optimal blocksize for each kernel based on occupancy
    MANUAL,     //!< Use block sizes specified manually via Preferences::manualBlockSizes
};
57 
58 //--------------------------------------------------------------------------
59 // CodeGenerator::CUDA::Preferences
60 //--------------------------------------------------------------------------
63 {
65  {
66  std::fill(manualBlockSizes.begin(), manualBlockSizes.end(), 32);
67  }
68 
70  bool showPtxInfo = false;
71 
73  bool generateLineInfo = false;
74 
77  bool selectGPUByDeviceID = false;
78 
80  bool enableNCCLReductions = false;
81 
85  bool generateSimpleErrorHandling = false;
86 
88  DeviceSelect deviceSelectMethod = DeviceSelect::OPTIMAL;
89 
91  unsigned int manualDeviceID = 0;
92 
94  BlockSizeSelect blockSizeSelectMethod = BlockSizeSelect::OCCUPANCY;
95 
98 
100 
103  size_t constantCacheOverhead = 72 * 5;
104 
106  std::string userNvccFlags = "";
107 
108  void updateHash(boost::uuids::detail::sha1 &hash) const
109  {
110  // Superclass
112 
113  // **NOTE** showPtxInfo, generateLineInfo and userNvccFlags only affect makefiles/msbuild
114  // **NOTE** block size optimization is also not relevant, the chosen block size is hashed in the backend
115  // **NOTE** while device selection is also not relevant as the chosen device is hashed in the backend, DeviceSelect::MANUAL_OVERRIDE is used in the backend
116 
118  Utils::updateHash(selectGPUByDeviceID, hash);
119  Utils::updateHash(deviceSelectMethod, hash);
120  Utils::updateHash(constantCacheOverhead, hash);
121  Utils::updateHash(enableNCCLReductions, hash);
122  Utils::updateHash(generateSimpleErrorHandling, hash);
123  }
124 };
125 
126 //--------------------------------------------------------------------------
127 // CodeGenerator::CUDA::Backend
128 //--------------------------------------------------------------------------
130 {
131 public:
132  Backend(const KernelBlockSize &kernelBlockSizes, const Preferences &preferences,
133  const std::string &scalarType, int device);
134 
135  //--------------------------------------------------------------------------
136  // CodeGenerator::BackendSIMT virtuals
137  //--------------------------------------------------------------------------
139  virtual bool areSharedMemAtomicsSlow() const override;
140 
142  virtual std::string getSharedPrefix() const override{ return "__shared__ "; }
143 
145  virtual std::string getThreadID(unsigned int axis = 0) const override;
146 
148  virtual std::string getBlockID(unsigned int axis = 0) const override;
149 
151  virtual std::string getCLZ() const override { return "__clz"; }
152 
154  virtual std::string getAtomic(const std::string &type, AtomicOperation op = AtomicOperation::ADD,
155  AtomicMemSpace memSpace = AtomicMemSpace::GLOBAL) const override;
156 
158  virtual void genSharedMemBarrier(CodeStream &os) const override;
159 
161  virtual void genPopulationRNGInit(CodeStream &os, const std::string &globalRNG, const std::string &seed, const std::string &sequence) const override;
162 
164  virtual void genPopulationRNGPreamble(CodeStream &os, Substitutions &subs, const std::string &globalRNG, const std::string &name = "rng") const override;
165 
167 
168  virtual void genPopulationRNGPostamble(CodeStream &os, const std::string &globalRNG) const override;
169 
171  virtual void genGlobalRNGSkipAhead(CodeStream &os, Substitutions &subs, const std::string &sequence, const std::string &name = "rng") const override;
172 
173  //--------------------------------------------------------------------------
174  // CodeGenerator::BackendBase virtuals
175  //--------------------------------------------------------------------------
176  virtual void genNeuronUpdate(CodeStream &os, const ModelSpecMerged &modelMerged,
177  HostHandler preambleHandler, HostHandler pushEGPHandler) const override;
178 
179  virtual void genSynapseUpdate(CodeStream &os, const ModelSpecMerged &modelMerged,
180  HostHandler preambleHandler, HostHandler pushEGPHandler) const override;
181 
182  virtual void genCustomUpdate(CodeStream &os, const ModelSpecMerged &modelMerged,
183  HostHandler preambleHandler, HostHandler pushEGPHandler) const override;
184 
185  virtual void genInit(CodeStream &os, const ModelSpecMerged &modelMerged,
186  HostHandler preambleHandler, HostHandler initPushEGPHandler, HostHandler initSparsePushEGPHandler) const override;
187 
188  virtual void genDefinitionsPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
189  virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
190  virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
191  virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
192  virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
193  virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
194 
195  virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
196  virtual void genVariableImplementation(CodeStream &os, const std::string &type, const std::string &name, VarLocation loc) const override;
197  virtual void genVariableAllocation(CodeStream &os, const std::string &type, const std::string &name, VarLocation loc, size_t count, MemAlloc &memAlloc) const override;
198  virtual void genVariableFree(CodeStream &os, const std::string &name, VarLocation loc) const override;
199 
200  virtual void genExtraGlobalParamDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
201  virtual void genExtraGlobalParamImplementation(CodeStream &os, const std::string &type, const std::string &name, VarLocation loc) const override;
202  virtual void genExtraGlobalParamAllocation(CodeStream &os, const std::string &type, const std::string &name,
203  VarLocation loc, const std::string &countVarName = "count", const std::string &prefix = "") const override;
204  virtual void genExtraGlobalParamPush(CodeStream &os, const std::string &type, const std::string &name,
205  VarLocation loc, const std::string &countVarName = "count", const std::string &prefix = "") const override;
206  virtual void genExtraGlobalParamPull(CodeStream &os, const std::string &type, const std::string &name,
207  VarLocation loc, const std::string &countVarName = "count", const std::string &prefix = "") const override;
208 
210  virtual void genMergedExtraGlobalParamPush(CodeStream &os, const std::string &suffix, size_t mergedGroupIdx,
211  const std::string &groupIdx, const std::string &fieldName,
212  const std::string &egpName) const override;
213 
215  virtual std::string getMergedGroupFieldHostType(const std::string &type) const override;
216 
218  virtual std::string getMergedGroupSimRNGType() const override { return "curandState"; }
219 
220  virtual void genVariablePush(CodeStream &os, const std::string &type, const std::string &name, VarLocation loc, bool autoInitialized, size_t count) const override;
221  virtual void genVariablePull(CodeStream &os, const std::string &type, const std::string &name, VarLocation loc, size_t count) const override;
222 
223  virtual void genCurrentVariablePush(CodeStream &os, const NeuronGroupInternal &ng, const std::string &type,
224  const std::string &name, VarLocation loc, unsigned int batchSize) const override;
225  virtual void genCurrentVariablePull(CodeStream &os, const NeuronGroupInternal &ng, const std::string &type,
226  const std::string &name, VarLocation loc, unsigned int batchSize) const override;
227 
228  virtual void genCurrentTrueSpikePush(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
229  {
230  genCurrentSpikePush(os, ng, batchSize, false);
231  }
232  virtual void genCurrentTrueSpikePull(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
233  {
234  genCurrentSpikePull(os, ng, batchSize, false);
235  }
236  virtual void genCurrentSpikeLikeEventPush(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
237  {
238  genCurrentSpikePush(os, ng, batchSize, true);
239  }
240  virtual void genCurrentSpikeLikeEventPull(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
241  {
242  genCurrentSpikePull(os, ng, batchSize, true);
243  }
244 
245  virtual void genGlobalDeviceRNG(CodeStream &definitions, CodeStream &definitionsInternal,
246  CodeStream &runner, CodeStream &allocations, CodeStream &free,
247  MemAlloc &memAlloc) const override;
248  virtual void genPopulationRNG(CodeStream &definitions, CodeStream &definitionsInternal,
249  CodeStream &runner, CodeStream &allocations, CodeStream &free,
250  const std::string &name, size_t count, MemAlloc &memAlloc) const override;
251  virtual void genTimer(CodeStream &definitions, CodeStream &definitionsInternal, CodeStream &runner,
252  CodeStream &allocations, CodeStream &free, CodeStream &stepTimeFinalise,
253  const std::string &name, bool updateInStepTime) const override;
254 
256  virtual void genReturnFreeDeviceMemoryBytes(CodeStream &os) const override;
257 
258  virtual void genMakefilePreamble(std::ostream &os) const override;
259  virtual void genMakefileLinkRule(std::ostream &os) const override;
260  virtual void genMakefileCompileRule(std::ostream &os) const override;
261 
262  virtual void genMSBuildConfigProperties(std::ostream &os) const override;
263  virtual void genMSBuildImportProps(std::ostream &os) const override;
264  virtual void genMSBuildItemDefinitions(std::ostream &os) const override;
265  virtual void genMSBuildCompileModule(const std::string &moduleName, std::ostream &os) const override;
266  virtual void genMSBuildImportTarget(std::ostream &os) const override;
267 
269  virtual std::string getAllocateMemParams(const ModelSpecMerged &) const override;
270 
272  virtual bool isPopulationRNGInitialisedOnDevice() const override { return true; }
273 
275  virtual bool isHostReductionRequired() const override { return getPreferences<Preferences>().enableNCCLReductions; }
276 
278  virtual size_t getDeviceMemoryBytes() const override{ return m_ChosenDevice.totalGlobalMem; }
279 
283  virtual MemorySpaces getMergedGroupMemorySpaces(const ModelSpecMerged &modelMerged) const override;
284 
285  virtual bool supportsNamespace() const override { return true; };
286 
288  virtual boost::uuids::detail::sha1::digest_type getHashDigest() const override;
289 
290  //--------------------------------------------------------------------------
291  // Public API
292  //--------------------------------------------------------------------------
293  const cudaDeviceProp &getChosenCUDADevice() const{ return m_ChosenDevice; }
294  int getChosenDeviceID() const{ return m_ChosenDeviceID; }
295  int getRuntimeVersion() const{ return m_RuntimeVersion; }
296  std::string getNVCCFlags() const;
297 
298 private:
299  //--------------------------------------------------------------------------
300  // Private methods
301  //--------------------------------------------------------------------------
302  template<typename T>
303  void genMergedStructArrayPush(CodeStream &os, const std::vector<T> &groups) const
304  {
305  // Loop through groups
306  for(const auto &g : groups) {
307  // Check that a memory space has been assigned
308  assert(!g.getMemorySpace().empty());
309 
310  // Implement merged group array in previously assigned memory space
311  os << g.getMemorySpace() << " Merged" << T::name << "Group" << g.getIndex() << " d_merged" << T::name << "Group" << g.getIndex() << "[" << g.getGroups().size() << "];" << std::endl;
312 
313  // Write function to update
314  os << "void pushMerged" << T::name << "Group" << g.getIndex() << "ToDevice(unsigned int idx, ";
315  g.generateStructFieldArgumentDefinitions(os, *this);
316  os << ")";
317  {
318  CodeStream::Scope b(os);
319 
320  // Loop through sorted fields and build struct on the stack
321  os << "Merged" << T::name << "Group" << g.getIndex() << " group = {";
322  const auto sortedFields = g.getSortedFields(*this);
323  for(const auto &f : sortedFields) {
324  os << std::get<1>(f) << ", ";
325  }
326  os << "};" << std::endl;
327 
328  // Push to device
329  os << "CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_merged" << T::name << "Group" << g.getIndex() << ", &group, ";
330  os << "sizeof(Merged" << T::name << "Group" << g.getIndex() << "), idx * sizeof(Merged" << T::name << "Group" << g.getIndex() << ")));" << std::endl;
331  }
332  }
333  }
334 
335 
337  size_t getChosenDeviceSafeConstMemBytes() const
338  {
339  return m_ChosenDevice.totalConstMem - getPreferences<Preferences>().constantCacheOverhead;
340  }
341 
342  void genCurrentSpikePush(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize, bool spikeEvent) const;
343  void genCurrentSpikePull(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize, bool spikeEvent) const;
344 
345  void genKernelDimensions(CodeStream &os, Kernel kernel, size_t numThreadsX, size_t batchSize, size_t numBlockThreadsY = 1) const;
346 
347  //--------------------------------------------------------------------------
348  // Members
349  //--------------------------------------------------------------------------
350  const int m_ChosenDeviceID;
351  cudaDeviceProp m_ChosenDevice;
352  int m_RuntimeVersion;
353 };
354 } // CUDA
355 } // CodeGenerator
Definition: neuronGroupInternal.h:9
Base class for Single Instruction Multiple Thread style backends.
Definition: backendSIMT.h:51
virtual std::string getCLZ() const override
Get the name of the count-leading-zeros function.
Definition: cuda/backend.h:151
VarLocation
< Flags defining which memory space variables should be allocated in
Definition: variableMode.h:10
virtual size_t getDeviceMemoryBytes() const override
How many bytes of memory does 'device' have.
Definition: cuda/backend.h:278
int getChosenDeviceID() const
Definition: cuda/backend.h:294
virtual bool supportsNamespace() const override
Does this backend support namespaces i.e. can C++ implementation of support functions be used...
Definition: cuda/backend.h:285
Pick device with most global memory.
virtual std::string getSharedPrefix() const override
Get the prefix to use for shared memory variables.
Definition: cuda/backend.h:142
Pick optimal device based on how well kernels can be simultaneously simulated and occupancy...
BlockSizeSelect
Methods for selecting CUDA kernel block size.
Definition: cuda/backend.h:52
Base class for backend preferences - can be accessed via a global in 'classic' C++ code generator...
Definition: backendBase.h:58
int getRuntimeVersion() const
Definition: cuda/backend.h:295
Definition: modelSpecMerged.h:31
Helper class for generating code - automatically inserts brackets, indents etc.
Definition: backendBase.h:30
virtual void genCurrentTrueSpikePush(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
Generate code for pushing true spikes emitted by a neuron group in the current timestep to the 'device'...
Definition: cuda/backend.h:228
Use device specified by user.
std::vector< std::pair< std::string, size_t > > MemorySpaces
Vector of prefixes required to allocate in memory space and size of memory space. ...
Definition: backendBase.h:190
Definition: codeStream.h:21
virtual bool isHostReductionRequired() const override
Backends which support batch-parallelism might require an additional host reduction phase after reduc...
Definition: cuda/backend.h:275
#define BACKEND_EXPORT
Definition: backendExport.h:13
Definition: backendBase.h:107
Use device specified by user at runtime with allocateMem parameter. Optimisation will be performed on...
virtual void genCurrentTrueSpikePull(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
Generate code for pulling true spikes emitted by a neuron group in the current timestep from the 'device'...
Definition: cuda/backend.h:232
Definition: substitutions.h:21
virtual std::string getMergedGroupSimRNGType() const override
When generating merged structures what type to use for simulation RNGs.
Definition: cuda/backend.h:218
virtual void genCurrentSpikeLikeEventPush(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
Generate code for pushing spike-like events emitted by a neuron group in the current timestep to the ...
Definition: cuda/backend.h:236
void updateHash(const T &value, boost::uuids::detail::sha1 &hash)
Hash arithmetic types and enums.
Definition: gennUtils.h:128
AtomicMemSpace
What memory space atomic operation is required.
Definition: backendSIMT.h:70
std::function< void(CodeStream &)> HostHandler
Definition: backendBase.h:182
Definition: cuda/backend.h:129
Preferences for CUDA backend.
Definition: cuda/backend.h:62
virtual void genCurrentSpikeLikeEventPull(CodeStream &os, const NeuronGroupInternal &ng, unsigned int batchSize) const override
Generate code for pulling spike-like events emitted by a neuron group in the current timestep from th...
Definition: cuda/backend.h:240
Kernel
Kernels generated by SIMT backends.
Definition: backendSIMT.h:24
AtomicOperation
What atomic operation is required.
Definition: backendSIMT.h:63
Preferences()
Definition: cuda/backend.h:64
Definition: generateModules.h:16
DeviceSelect
Methods for selecting CUDA device.
Definition: cuda/backend.h:40
Pick optimal blocksize for each kernel based on occupancy.
KernelBlockSize manualBlockSizes
If block size select method is set to BlockSizeSelect::MANUAL, block size to use for each kernel...
Definition: cuda/backend.h:97
const cudaDeviceProp & getChosenCUDADevice() const
Definition: cuda/backend.h:293
virtual bool isPopulationRNGInitialisedOnDevice() const override
Different backends seed RNGs in different ways. Does this one initialise population RNGS on device...
Definition: cuda/backend.h:272
std::array< size_t, KernelMax > KernelBlockSize
Array of block sizes for each kernel.
Definition: backendSIMT.h:44
void updateHash(boost::uuids::detail::sha1 &hash) const
Definition: cuda/backend.h:108
Definition: codeStream.h:94