HMSBEAGLE  1.0.0
BeagleGPUImpl.hpp
1 
2 /*
3  * BeagleGPUImpl.cpp
4  * BEAGLE
5  *
6  * Copyright 2009 Phylogenetic Likelihood Working Group
7  *
8  * This file is part of BEAGLE.
9  *
10  * BEAGLE is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU Lesser General Public License as
12  * published by the Free Software Foundation, either version 3 of
13  * the License, or (at your option) any later version.
14  *
15  * BEAGLE is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with BEAGLE. If not, see
22  * <http://www.gnu.org/licenses/>.
23  *
24  * @author Marc Suchard
25  * @author Andrew Rambaut
26  * @author Daniel Ayres
27  * @author Aaron Darling
28  */
29 #ifdef HAVE_CONFIG_H
30 #include "libhmsbeagle/config.h"
31 #endif
32 
33 #include <cstdio>
34 #include <cstdlib>
35 #include <cassert>
36 #include <iostream>
37 #include <cstring>
38 
39 #include "libhmsbeagle/beagle.h"
40 #include "libhmsbeagle/GPU/GPUImplDefs.h"
41 #include "libhmsbeagle/GPU/BeagleGPUImpl.h"
42 #include "libhmsbeagle/GPU/GPUImplHelper.h"
43 #include "libhmsbeagle/GPU/KernelLauncher.h"
44 #include "libhmsbeagle/GPU/GPUInterface.h"
45 #include "libhmsbeagle/GPU/Precision.h"
46 
47 namespace beagle {
48 namespace gpu {
49 
50 BEAGLE_GPU_TEMPLATE
51 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::BeagleGPUImpl() {
52 
53  gpu = NULL;
54  kernels = NULL;
55 
56  dIntegrationTmp = (GPUPtr)NULL;
57  dOutFirstDeriv = (GPUPtr)NULL;
58  dOutSecondDeriv = (GPUPtr)NULL;
59  dPartialsTmp = (GPUPtr)NULL;
60  dFirstDerivTmp = (GPUPtr)NULL;
61  dSecondDerivTmp = (GPUPtr)NULL;
62 
63  dSumLogLikelihood = (GPUPtr)NULL;
64  dSumFirstDeriv = (GPUPtr)NULL;
65  dSumSecondDeriv = (GPUPtr)NULL;
66 
67  dPatternWeights = (GPUPtr)NULL;
68 
69  dBranchLengths = (GPUPtr)NULL;
70 
71  dDistanceQueue = (GPUPtr)NULL;
72 
73  dPtrQueue = (GPUPtr)NULL;
74 
75  dMaxScalingFactors = (GPUPtr)NULL;
76  dIndexMaxScalingFactors = (GPUPtr)NULL;
77 
78  dEigenValues = NULL;
79  dEvec = NULL;
80  dIevc = NULL;
81 
82  dWeights = NULL;
83  dFrequencies = NULL;
84 
85  dScalingFactors = NULL;
86 
87  dStates = NULL;
88 
89  dPartials = NULL;
90  dMatrices = NULL;
91 
92  dCompactBuffers = NULL;
93  dTipPartialsBuffers = NULL;
94 
95  hPtrQueue = NULL;
96 
97  hCategoryRates = NULL;
98 
99  hPatternWeightsCache = NULL;
100 
101  hDistanceQueue = NULL;
102 
103  hWeightsCache = NULL;
104  hFrequenciesCache = NULL;
105  hLogLikelihoodsCache = NULL;
106  hPartialsCache = NULL;
107  hStatesCache = NULL;
108  hMatrixCache = NULL;
109 
110  hRescalingTrigger = NULL;
111  dRescalingTrigger = (GPUPtr)NULL;
112  dScalingFactorsMaster = NULL;
113 }
114 
115 BEAGLE_GPU_TEMPLATE
116 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::~BeagleGPUImpl() {
117 
118  if (kInitialized) {
119  for (int i=0; i < kEigenDecompCount; i++) {
120  gpu->FreeMemory(dEigenValues[i]);
121  gpu->FreeMemory(dEvec[i]);
122  gpu->FreeMemory(dIevc[i]);
123  gpu->FreeMemory(dWeights[i]);
124  gpu->FreeMemory(dFrequencies[i]);
125  }
126 
127  gpu->FreeMemory(dMatrices[0]);
128 
129  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
130  gpu->FreePinnedHostMemory(hRescalingTrigger);
131  for (int i = 0; i < kScaleBufferCount; i++) {
132  if (dScalingFactorsMaster[i] != 0)
133  gpu->FreeMemory(dScalingFactorsMaster[i]);
134  }
135  free(dScalingFactorsMaster);
136  } else {
137  if (kScaleBufferCount > 0)
138  gpu->FreeMemory(dScalingFactors[0]);
139  }
140 
141  for (int i = 0; i < kBufferCount; i++) {
142  if (i < kTipCount) { // For the tips
143  if (i < kCompactBufferCount)
144  gpu->FreeMemory(dCompactBuffers[i]);
145  if (i < kTipPartialsBufferCount)
146  gpu->FreeMemory(dTipPartialsBuffers[i]);
147  } else {
148  gpu->FreeMemory(dPartials[i]);
149  }
150  }
151 
152  gpu->FreeMemory(dIntegrationTmp);
153  gpu->FreeMemory(dOutFirstDeriv);
154  gpu->FreeMemory(dOutSecondDeriv);
155  gpu->FreeMemory(dPartialsTmp);
156  gpu->FreeMemory(dFirstDerivTmp);
157  gpu->FreeMemory(dSecondDerivTmp);
158 
159  gpu->FreeMemory(dSumLogLikelihood);
160  gpu->FreeMemory(dSumFirstDeriv);
161  gpu->FreeMemory(dSumSecondDeriv);
162 
163  gpu->FreeMemory(dPatternWeights);
164 
165  gpu->FreeMemory(dBranchLengths);
166 
167  gpu->FreeMemory(dDistanceQueue);
168 
169  gpu->FreeMemory(dPtrQueue);
170 
171  gpu->FreeMemory(dMaxScalingFactors);
172  gpu->FreeMemory(dIndexMaxScalingFactors);
173 
174  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
175  gpu->FreeMemory(dAccumulatedScalingFactors);
176  }
177 
178  free(dEigenValues);
179  free(dEvec);
180  free(dIevc);
181 
182  free(dWeights);
183  free(dFrequencies);
184 
185  free(dScalingFactors);
186 
187  free(dStates);
188 
189  free(dPartials);
190  free(dMatrices);
191 
192  free(dCompactBuffers);
193  free(dTipPartialsBuffers);
194 
195  gpu->FreeHostMemory(hPtrQueue);
196 
197  gpu->FreeHostMemory(hCategoryRates);
198 
199  gpu->FreeHostMemory(hPatternWeightsCache);
200 
201  gpu->FreeHostMemory(hDistanceQueue);
202 
203  gpu->FreeHostMemory(hWeightsCache);
204  gpu->FreeHostMemory(hFrequenciesCache);
205  gpu->FreeHostMemory(hPartialsCache);
206  gpu->FreeHostMemory(hStatesCache);
207 
208  gpu->FreeHostMemory(hLogLikelihoodsCache);
209  gpu->FreeHostMemory(hMatrixCache);
210 
211  }
212 
213  if (kernels)
214  delete kernels;
215  if (gpu)
216  delete gpu;
217 
218 }
219 
220 BEAGLE_GPU_TEMPLATE
221 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::createInstance(int tipCount,
222  int partialsBufferCount,
223  int compactBufferCount,
224  int stateCount,
225  int patternCount,
226  int eigenDecompositionCount,
227  int matrixCount,
228  int categoryCount,
229  int scaleBufferCount,
230  int iResourceNumber,
231  long preferenceFlags,
232  long requirementFlags) {
233 
234 #ifdef BEAGLE_DEBUG_FLOW
235  fprintf(stderr, "\tEntering BeagleGPUImpl::createInstance\n");
236 #endif
237 
238  kInitialized = 0;
239 
240  kTipCount = tipCount;
241  kPartialsBufferCount = partialsBufferCount;
242  kCompactBufferCount = compactBufferCount;
243  kStateCount = stateCount;
244  kPatternCount = patternCount;
245  kEigenDecompCount = eigenDecompositionCount;
246  kMatrixCount = matrixCount;
247  kCategoryCount = categoryCount;
248  kScaleBufferCount = scaleBufferCount;
249 
250  resourceNumber = iResourceNumber;
251 
252  kTipPartialsBufferCount = kTipCount - kCompactBufferCount;
253  kBufferCount = kPartialsBufferCount + kCompactBufferCount;
254 
255  kInternalPartialsBufferCount = kBufferCount - kTipCount;
256 
257  if (kStateCount <= 4)
258  kPaddedStateCount = 4;
259  else if (kStateCount <= 16)
260  kPaddedStateCount = 16;
261  else if (kStateCount <= 32)
262  kPaddedStateCount = 32;
263  else if (kStateCount <= 48)
264  kPaddedStateCount = 48;
265  else if (kStateCount <= 64)
266  kPaddedStateCount = 64;
267  else if (kStateCount <= 80)
268  kPaddedStateCount = 80;
269  else if (kStateCount <= 128)
270  kPaddedStateCount = 128;
271  else if (kStateCount <= 192)
272  kPaddedStateCount = 192;
273  else
274  kPaddedStateCount = kStateCount + kStateCount % 16;
275 
276  // Make sure that kPaddedPatternCount + paddedPatterns is multiple of 4 for DNA model
277  int paddedPatterns = 0;
278  if (kPaddedStateCount == 4 && kPatternCount % 4 != 0)
279  paddedPatterns = 4 - kPatternCount % 4;
280 
281  // TODO Should do something similar for 4 < kStateCount <= 8 as well
282 
283  kPaddedPatternCount = kPatternCount + paddedPatterns;
284 
285  int resultPaddedPatterns = 0;
286  int patternBlockSizeFour = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? PATTERN_BLOCK_SIZE_DP_4 : PATTERN_BLOCK_SIZE_SP_4);
287  if (kPaddedStateCount == 4 && kPaddedPatternCount % patternBlockSizeFour != 0)
288  resultPaddedPatterns = patternBlockSizeFour - kPaddedPatternCount % patternBlockSizeFour;
289 
290  kScaleBufferSize = kPaddedPatternCount;
291 
292  kFlags = 0;
293 
294  if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
295  kFlags |= BEAGLE_FLAG_SCALING_AUTO;
296  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
297  kScaleBufferCount = kInternalPartialsBufferCount;
298  kScaleBufferSize *= kCategoryCount;
299  } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
300  kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
301  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
302  kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for temp buffer used by edgelikelihood
303  } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
304  kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
305  kFlags |= BEAGLE_FLAG_SCALERS_RAW;
306  } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
307  kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
308  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
309  } else {
310  kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
311  kFlags |= BEAGLE_FLAG_SCALERS_RAW;
312  }
313 
314  if (preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX || requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
315  kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
316  } else {
317  kFlags |= BEAGLE_FLAG_EIGEN_REAL;
318  }
319 
320  if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
321  kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
322  else
323  kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;
324 
325  Real r = 0;
326  modifyFlagsForPrecision(&kFlags, r);
327 
328  int sumSitesBlockSize = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? SUM_SITES_BLOCK_SIZE_DP : SUM_SITES_BLOCK_SIZE_SP);
329  kSumSitesBlockCount = kPatternCount / sumSitesBlockSize;
330  if (kPatternCount % sumSitesBlockSize != 0)
331  kSumSitesBlockCount += 1;
332 
333  kPartialsSize = kPaddedPatternCount * kPaddedStateCount * kCategoryCount;
334  kMatrixSize = kPaddedStateCount * kPaddedStateCount;
335 
336  if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
337  kEigenValuesSize = 2 * kPaddedStateCount;
338  else
339  kEigenValuesSize = kPaddedStateCount;
340 
341  kLastCompactBufferIndex = -1;
342  kLastTipPartialsBufferIndex = -1;
343 
344  gpu = new GPUInterface();
345 
346  gpu->Initialize();
347 
348  int numDevices = 0;
349  numDevices = gpu->GetDeviceCount();
350  if (numDevices == 0) {
351  fprintf(stderr, "Error: No GPU devices\n");
352  return BEAGLE_ERROR_NO_RESOURCE;
353  }
354  if (resourceNumber > numDevices) {
355  fprintf(stderr,"Error: Trying to initialize device # %d (which does not exist)\n",resourceNumber);
356  return BEAGLE_ERROR_NO_RESOURCE;
357  }
358 
359  // TODO: recompiling kernels for every instance, probably not ideal
360  gpu->SetDevice(resourceNumber-1,kPaddedStateCount,kCategoryCount,kPaddedPatternCount,kFlags);
361 
362  int ptrQueueLength = kMatrixCount * kCategoryCount * 3;
363  if (kPartialsBufferCount > ptrQueueLength)
364  ptrQueueLength = kPartialsBufferCount;
365 
366  unsigned int neededMemory = sizeof(Real) * (kMatrixSize * kEigenDecompCount + // dEvec
367  kMatrixSize * kEigenDecompCount + // dIevc
368  kEigenValuesSize * kEigenDecompCount + // dEigenValues
369  kCategoryCount * kPartialsBufferCount + // dWeights
370  kPaddedStateCount * kPartialsBufferCount + // dFrequencies
371  kPaddedPatternCount + // dIntegrationTmp
372  kPaddedPatternCount + // dOutFirstDeriv
373  kPaddedPatternCount + // dOutSecondDeriv
374  kPartialsSize + // dPartialsTmp
375  kPartialsSize + // dFirstDerivTmp
376  kPartialsSize + // dSecondDerivTmp
377  kScaleBufferCount * kPaddedPatternCount + // dScalingFactors
378  kPartialsBufferCount * kPartialsSize + // dTipPartialsBuffers + dPartials
379  kMatrixCount * kMatrixSize * kCategoryCount + // dMatrices
380  kBufferCount + // dBranchLengths
381  kMatrixCount * kCategoryCount * 2) + // dDistanceQueue
382  sizeof(int) * kCompactBufferCount * kPaddedPatternCount + // dCompactBuffers
383  sizeof(GPUPtr) * ptrQueueLength; // dPtrQueue
384 
385 
386  unsigned int availableMem = gpu->GetAvailableMemory();
387 
388 #ifdef BEAGLE_DEBUG_VALUES
389  fprintf(stderr, " needed memory: %d\n", neededMemory);
390  fprintf(stderr, " available memory: %d\n", availableMem);
391 #endif
392 
393  if (availableMem < neededMemory)
394  return BEAGLE_ERROR_OUT_OF_MEMORY;
395 
396  kernels = new KernelLauncher(gpu);
397 
398  // TODO: only allocate if necessary on the fly
399  hWeightsCache = (Real*) gpu->CallocHost(kCategoryCount * kPartialsBufferCount, sizeof(Real));
400  hFrequenciesCache = (Real*) gpu->CallocHost(kPaddedStateCount * kPartialsBufferCount, sizeof(Real));
401  hPartialsCache = (Real*) gpu->CallocHost(kPartialsSize, sizeof(Real));
402  hStatesCache = (int*) gpu->CallocHost(kPaddedPatternCount, sizeof(int));
403 
404  int hMatrixCacheSize = kMatrixSize * kCategoryCount * BEAGLE_CACHED_MATRICES_COUNT;
405  if ((2 * kMatrixSize + kEigenValuesSize) > hMatrixCacheSize)
406  hMatrixCacheSize = 2 * kMatrixSize + kEigenValuesSize;
407 
408  hLogLikelihoodsCache = (Real*) gpu->MallocHost(kPatternCount * sizeof(Real));
409  hMatrixCache = (Real*) gpu->CallocHost(hMatrixCacheSize, sizeof(Real));
410 
411  dEvec = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
412  dIevc = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
413  dEigenValues = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
414  dWeights = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
415  dFrequencies = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
416 
417  dMatrices = (GPUPtr*) malloc(sizeof(GPUPtr) * kMatrixCount);
418  dMatrices[0] = gpu->AllocateMemory(kMatrixCount * kMatrixSize * kCategoryCount * sizeof(Real));
419 
420  for (int i = 1; i < kMatrixCount; i++) {
421  dMatrices[i] = dMatrices[i-1] + kMatrixSize * kCategoryCount * sizeof(Real);
422  }
423 
424  if (kScaleBufferCount > 0) {
425  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
426  dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
427  dScalingFactors[0] = gpu->AllocateMemory(sizeof(signed char) * kScaleBufferSize * kScaleBufferCount); // TODO: char won't work for double-precision
428  for (int i=1; i < kScaleBufferCount; i++)
429  dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(signed char);
430  } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
431  dScalingFactors = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount);
432  dScalingFactorsMaster = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount);
433  hRescalingTrigger = (int*) gpu->AllocatePinnedHostMemory(sizeof(int), false, true);
434  dRescalingTrigger = gpu->GetDevicePointer((void*) hRescalingTrigger);
435  } else {
436  dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
437  dScalingFactors[0] = gpu->AllocateMemory(kScaleBufferSize * kScaleBufferCount * sizeof(Real));
438  for (int i=1; i < kScaleBufferCount; i++) {
439  dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(Real);
440  }
441  }
442  }
443 
444  for(int i=0; i<kEigenDecompCount; i++) {
445  dEvec[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
446  dIevc[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
447  dEigenValues[i] = gpu->AllocateMemory(kEigenValuesSize * sizeof(Real));
448  dWeights[i] = gpu->AllocateMemory(kCategoryCount * sizeof(Real));
449  dFrequencies[i] = gpu->AllocateMemory(kPaddedStateCount * sizeof(Real));
450  }
451 
452 
453  dIntegrationTmp = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
454  dOutFirstDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
455  dOutSecondDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
456 
457  dPatternWeights = gpu->AllocateMemory(kPatternCount * sizeof(Real));
458 
459  dSumLogLikelihood = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
460  dSumFirstDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
461  dSumSecondDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
462 
463  dPartialsTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
464  dFirstDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
465  dSecondDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
466 
467  // Fill with 0s so 'free' does not choke if unallocated
468  dPartials = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount);
469 
470  // Internal nodes have 0s so partials are used
471  dStates = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount);
472 
473  dCompactBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kCompactBufferCount);
474  dTipPartialsBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kTipPartialsBufferCount);
475 
476  for (int i = 0; i < kBufferCount; i++) {
477  if (i < kTipCount) { // For the tips
478  if (i < kCompactBufferCount)
479  dCompactBuffers[i] = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));
480  if (i < kTipPartialsBufferCount)
481  dTipPartialsBuffers[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
482  } else {
483  dPartials[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
484  }
485  }
486 
487  kLastCompactBufferIndex = kCompactBufferCount - 1;
488  kLastTipPartialsBufferIndex = kTipPartialsBufferCount - 1;
489 
490  // No execution has more no kBufferCount events
491  dBranchLengths = gpu->AllocateMemory(kBufferCount * sizeof(Real));
492 
493  dDistanceQueue = gpu->AllocateMemory(kMatrixCount * kCategoryCount * 2 * sizeof(Real));
494  hDistanceQueue = (Real*) gpu->MallocHost(sizeof(Real) * kMatrixCount * kCategoryCount * 2);
495  checkHostMemory(hDistanceQueue);
496 
497  dPtrQueue = gpu->AllocateMemory(sizeof(unsigned int) * ptrQueueLength);
498  hPtrQueue = (unsigned int*) gpu->MallocHost(sizeof(unsigned int) * ptrQueueLength);
499  checkHostMemory(hPtrQueue);
500 
501  hCategoryRates = (double*) gpu->MallocHost(sizeof(double) * kCategoryCount); // Keep in double-precision
502  checkHostMemory(hCategoryRates);
503 
504  hPatternWeightsCache = (Real*) gpu->MallocHost(sizeof(double) * kPatternCount);
505  checkHostMemory(hPatternWeightsCache);
506 
507  dMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(Real));
508  dIndexMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));
509 
510  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
511  dAccumulatedScalingFactors = gpu->AllocateMemory(sizeof(int) * kScaleBufferSize);
512  }
513 
514 #ifdef BEAGLE_DEBUG_FLOW
515  fprintf(stderr, "\tLeaving BeagleGPUImpl::createInstance\n");
516 #endif
517 
518  kInitialized = 1;
519 
520 #ifdef BEAGLE_DEBUG_VALUES
521  gpu->Synchronize();
522  int usedMemory = availableMem - gpu->GetAvailableMemory();
523  fprintf(stderr, "actual used memory: %d\n", usedMemory);
524  fprintf(stderr, " difference: %d\n\n", usedMemory-neededMemory);
525 #endif
526 
527  return BEAGLE_SUCCESS;
528 }
529 
530 template<>
531 char* BeagleGPUImpl<double>::getInstanceName() {
532  return (char*) "CUDA-Double";
533 }
534 
535 template<>
536 char* BeagleGPUImpl<float>::getInstanceName() {
537  return (char*) "CUDA-Single";
538 }
539 
540 BEAGLE_GPU_TEMPLATE
541 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
542  if (returnInfo != NULL) {
543  returnInfo->resourceNumber = resourceNumber;
544  returnInfo->flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
545  BEAGLE_FLAG_THREADING_NONE |
546  BEAGLE_FLAG_VECTOR_NONE |
547  BEAGLE_FLAG_PROCESSOR_GPU;
548  Real r = 0;
549  modifyFlagsForPrecision(&(returnInfo->flags), r);
550 
551  returnInfo->flags |= kFlags;
552 
553  returnInfo->implName = getInstanceName();
554  }
555  return BEAGLE_SUCCESS;
556 }
557 
558 BEAGLE_GPU_TEMPLATE
559 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipStates(int tipIndex,
560  const int* inStates) {
561 
562 #ifdef BEAGLE_DEBUG_FLOW
563  fprintf(stderr, "\tEntering BeagleGPUImpl::setTipStates\n");
564 #endif
565 
566  if (tipIndex < 0 || tipIndex >= kTipCount)
567  return BEAGLE_ERROR_OUT_OF_RANGE;
568 
569  for(int i = 0; i < kPatternCount; i++)
570  hStatesCache[i] = (inStates[i] < kStateCount ? inStates[i] : kPaddedStateCount);
571 
572  // Padded extra patterns
573  for(int i = kPatternCount; i < kPaddedPatternCount; i++)
574  hStatesCache[i] = kPaddedStateCount;
575 
576  if (dStates[tipIndex] == 0) {
577  assert(kLastCompactBufferIndex >= 0 && kLastCompactBufferIndex < kCompactBufferCount);
578  dStates[tipIndex] = dCompactBuffers[kLastCompactBufferIndex--];
579  }
580  // Copy to GPU device
581  gpu->MemcpyHostToDevice(dStates[tipIndex], hStatesCache, sizeof(int) * kPaddedPatternCount);
582 
583 #ifdef BEAGLE_DEBUG_FLOW
584  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipStates\n");
585 #endif
586 
587  return BEAGLE_SUCCESS;
588 }
589 
590 BEAGLE_GPU_TEMPLATE
591 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipPartials(int tipIndex,
592  const double* inPartials) {
593 #ifdef BEAGLE_DEBUG_FLOW
594  fprintf(stderr, "\tEntering BeagleGPUImpl::setTipPartials\n");
595 #endif
596 
597  if (tipIndex < 0 || tipIndex >= kTipCount)
598  return BEAGLE_ERROR_OUT_OF_RANGE;
599 
600  const double* inPartialsOffset = inPartials;
601  Real* tmpRealPartialsOffset = hPartialsCache;
602  for (int i = 0; i < kPatternCount; i++) {
603 //#ifdef DOUBLE_PRECISION
604 // memcpy(tmpRealPartialsOffset, inPartialsOffset, sizeof(Real) * kStateCount);
605 //#else
606 // MEMCNV(tmpRealPartialsOffset, inPartialsOffset, kStateCount, Real);
607 //#endif
608  beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
609  tmpRealPartialsOffset += kPaddedStateCount;
610  inPartialsOffset += kStateCount;
611  }
612 
613  int partialsLength = kPaddedPatternCount * kPaddedStateCount;
614  for (int i = 1; i < kCategoryCount; i++) {
615  memcpy(hPartialsCache + i * partialsLength, hPartialsCache, partialsLength * sizeof(Real));
616  }
617 
618  if (tipIndex < kTipCount) {
619  if (dPartials[tipIndex] == 0) {
620  assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
621  kTipPartialsBufferCount);
622  dPartials[tipIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
623  }
624  }
625  // Copy to GPU device
626  gpu->MemcpyHostToDevice(dPartials[tipIndex], hPartialsCache, sizeof(Real) * kPartialsSize);
627 
628 #ifdef BEAGLE_DEBUG_FLOW
629  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipPartials\n");
630 #endif
631 
632  return BEAGLE_SUCCESS;
633 }
634 
635 BEAGLE_GPU_TEMPLATE
636 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPartials(int bufferIndex,
637  const double* inPartials) {
638 #ifdef BEAGLE_DEBUG_FLOW
639  fprintf(stderr, "\tEntering BeagleGPUImpl::setPartials\n");
640 #endif
641 
642  if (bufferIndex < 0 || bufferIndex >= kPartialsBufferCount)
643  return BEAGLE_ERROR_OUT_OF_RANGE;
644 
645  const double* inPartialsOffset = inPartials;
646  Real* tmpRealPartialsOffset = hPartialsCache;
647  for (int l = 0; l < kCategoryCount; l++) {
648  for (int i = 0; i < kPatternCount; i++) {
649 //#ifdef DOUBLE_PRECISION
650 // memcpy(tmpRealPartialsOffset, inPartialsOffset, sizeof(Real) * kStateCount);
651 //#else
652 // MEMCNV(tmpRealPartialsOffset, inPartialsOffset, kStateCount, Real);
653 //#endif
654  beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
655  tmpRealPartialsOffset += kPaddedStateCount;
656  inPartialsOffset += kStateCount;
657  }
658  tmpRealPartialsOffset += kPaddedStateCount * (kPaddedPatternCount - kPatternCount);
659  }
660 
661  if (bufferIndex < kTipCount) {
662  if (dPartials[bufferIndex] == 0) {
663  assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
664  kTipPartialsBufferCount);
665  dPartials[bufferIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
666  }
667  }
668  // Copy to GPU device
669  gpu->MemcpyHostToDevice(dPartials[bufferIndex], hPartialsCache, sizeof(Real) * kPartialsSize);
670 
671 #ifdef BEAGLE_DEBUG_FLOW
672  fprintf(stderr, "\tLeaving BeagleGPUImpl::setPartials\n");
673 #endif
674 
675  return BEAGLE_SUCCESS;
676 }
677 
678 BEAGLE_GPU_TEMPLATE
679 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getPartials(int bufferIndex,
680  int scaleIndex,
681  double* outPartials) {
682 #ifdef BEAGLE_DEBUG_FLOW
683  fprintf(stderr, "\tEntering BeagleGPUImpl::getPartials\n");
684 #endif
685 
686  gpu->MemcpyDeviceToHost(hPartialsCache, dPartials[bufferIndex], sizeof(Real) * kPartialsSize);
687 
688  double* outPartialsOffset = outPartials;
689  Real* tmpRealPartialsOffset = hPartialsCache;
690 
691  for (int i = 0; i < kPatternCount; i++) {
692 //#ifdef DOUBLE_PRECISION
693 // memcpy(outPartialsOffset, tmpRealPartialsOffset, sizeof(Real) * kStateCount);
694 //#else
695 // MEMCNV(outPartialsOffset, tmpRealPartialsOffset, kStateCount, double);
696 //#endif
697  beagleMemCpy(outPartialsOffset, tmpRealPartialsOffset, kStateCount);
698  tmpRealPartialsOffset += kPaddedStateCount;
699  outPartialsOffset += kStateCount;
700  }
701 
702 #ifdef BEAGLE_DEBUG_FLOW
703  fprintf(stderr, "\tLeaving BeagleGPUImpl::getPartials\n");
704 #endif
705 
706  return BEAGLE_SUCCESS;
707 }
708 
709 BEAGLE_GPU_TEMPLATE
710 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setEigenDecomposition(int eigenIndex,
711  const double* inEigenVectors,
712  const double* inInverseEigenVectors,
713  const double* inEigenValues) {
714 
715 #ifdef BEAGLE_DEBUG_FLOW
716  fprintf(stderr,"\tEntering BeagleGPUImpl::setEigenDecomposition\n");
717 #endif
718 
719  // Native memory packing order (length): Ievc (state^2), Evec (state^2),
720  // Eval (state), EvalImag (state)
721 
722  Real* Ievc, * tmpIevc, * Evec, * tmpEvec, * Eval;
723 
724  tmpIevc = Ievc = (Real*) hMatrixCache;
725  tmpEvec = Evec = Ievc + kMatrixSize;
726  Eval = Evec + kMatrixSize;
727 
728  for (int i = 0; i < kStateCount; i++) {
729 //#ifdef DOUBLE_PRECISION
730 // memcpy(tmpIevc, inInverseEigenVectors + i * kStateCount, sizeof(Real) * kStateCount);
731 // memcpy(tmpEvec, inEigenVectors + i * kStateCount, sizeof(Real) * kStateCount);
732 //#else
733 // MEMCNV(tmpIevc, (inInverseEigenVectors + i * kStateCount), kStateCount, Real);
734 // MEMCNV(tmpEvec, (inEigenVectors + i * kStateCount), kStateCount, Real);
735 //#endif
736  beagleMemCpy(tmpIevc, inInverseEigenVectors + i * kStateCount, kStateCount);
737  beagleMemCpy(tmpEvec, inEigenVectors + i * kStateCount, kStateCount);
738  tmpIevc += kPaddedStateCount;
739  tmpEvec += kPaddedStateCount;
740  }
741 
742  // Transposing matrices avoids incoherent memory read/writes
743  // TODO: Only need to tranpose sub-matrix of trueStateCount
744  if (kFlags & BEAGLE_FLAG_INVEVEC_STANDARD)
745  transposeSquareMatrix(Ievc, kPaddedStateCount);
746  transposeSquareMatrix(Evec, kPaddedStateCount);
747 
748 //#ifdef DOUBLE_PRECISION
749 // memcpy(Eval, inEigenValues, sizeof(Real) * kStateCount);
750 // if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
751 // memcpy(Eval+kPaddedStateCount,inEigenValues+kStateCount,sizeof(Real)*kStateCount);
752 //#else
753 // MEMCNV(Eval, inEigenValues, kStateCount, Real);
754 // if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
755 // MEMCNV((Eval+kPaddedStateCount),(inEigenValues+kStateCount),kStateCount,Real);
756 //#endif
757  beagleMemCpy(Eval, inEigenValues, kStateCount);
758  if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
759  beagleMemCpy(Eval + kPaddedStateCount, inEigenValues + kStateCount, kStateCount);
760  }
761 
762 #ifdef BEAGLE_DEBUG_VALUES
763 //#ifdef DOUBLE_PRECISION
764 // fprintf(stderr, "Eval:\n");
765 // printfVectorD(Eval, kEigenValuesSize);
766 // fprintf(stderr, "Evec:\n");
767 // printfVectorD(Evec, kMatrixSize);
768 // fprintf(stderr, "Ievc:\n");
769 // printfVectorD(Ievc, kPaddedStateCount * kPaddedStateCount);
770 //#else
771  fprintf(stderr, "Eval =\n");
772  printfVector(Eval, kEigenValuesSize);
773  fprintf(stderr, "Evec =\n");
774  printfVector(Evec, kMatrixSize);
775  fprintf(stderr, "Ievc =\n");
776  printfVector(Ievc, kPaddedStateCount * kPaddedStateCount);
777 //#endif
778 #endif
779 
780  // Copy to GPU device
781  gpu->MemcpyHostToDevice(dIevc[eigenIndex], Ievc, sizeof(Real) * kMatrixSize);
782  gpu->MemcpyHostToDevice(dEvec[eigenIndex], Evec, sizeof(Real) * kMatrixSize);
783  gpu->MemcpyHostToDevice(dEigenValues[eigenIndex], Eval, sizeof(Real) * kEigenValuesSize);
784 
785 #ifdef BEAGLE_DEBUG_VALUES
786  Real r = 0;
787  fprintf(stderr, "dEigenValues =\n");
788  gpu->PrintfDeviceVector(dEigenValues[eigenIndex], kEigenValuesSize, r);
789  fprintf(stderr, "dEvec =\n");
790  gpu->PrintfDeviceVector(dEvec[eigenIndex], kMatrixSize, r);
791  fprintf(stderr, "dIevc =\n");
792  gpu->PrintfDeviceVector(dIevc[eigenIndex], kPaddedStateCount * kPaddedStateCount, r);
793 #endif
794 
795 #ifdef BEAGLE_DEBUG_FLOW
796  fprintf(stderr, "\tLeaving BeagleGPUImpl::setEigenDecomposition\n");
797 #endif
798 
799  return BEAGLE_SUCCESS;
800 }
801 
802 BEAGLE_GPU_TEMPLATE
803 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
804  const double* inStateFrequencies) {
805 #ifdef BEAGLE_DEBUG_FLOW
806  fprintf(stderr, "\tEntering BeagleGPUImpl::setStateFrequencies\n");
807 #endif
808 
809  if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
810  return BEAGLE_ERROR_OUT_OF_RANGE;
811 
812 //#ifdef DOUBLE_PRECISION
813 // memcpy(hFrequenciesCache, inStateFrequencies, kStateCount * sizeof(Real));
814 //#else
815 // MEMCNV(hFrequenciesCache, inStateFrequencies, kStateCount, Real);
816 //#endif
817  beagleMemCpy(hFrequenciesCache, inStateFrequencies, kStateCount);
818 
819  gpu->MemcpyHostToDevice(dFrequencies[stateFrequenciesIndex], hFrequenciesCache,
820  sizeof(Real) * kPaddedStateCount);
821 
822 #ifdef BEAGLE_DEBUG_FLOW
823  fprintf(stderr, "\tLeaving BeagleGPUImpl::setStateFrequencies\n");
824 #endif
825 
826  return BEAGLE_SUCCESS;
827 }
828 
829 BEAGLE_GPU_TEMPLATE
830 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
831  const double* inCategoryWeights) {
832 #ifdef BEAGLE_DEBUG_FLOW
833  fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryWeights\n");
834 #endif
835 
836  if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
837  return BEAGLE_ERROR_OUT_OF_RANGE;
838 
839 //#ifdef DOUBLE_PRECISION
840 // const double* tmpWeights = inCategoryWeights;
841 //#else
842 // Real* tmpWeights = hWeightsCache;
843 // MEMCNV(hWeightsCache, inCategoryWeights, kCategoryCount, Real);
844 //#endif
845  const Real* tmpWeights = beagleCastIfNecessary(inCategoryWeights, hWeightsCache,
846  kCategoryCount);
847 
848  gpu->MemcpyHostToDevice(dWeights[categoryWeightsIndex], tmpWeights,
849  sizeof(Real) * kCategoryCount);
850 
851 #ifdef BEAGLE_DEBUG_FLOW
852  fprintf(stderr, "\tLeaving BeagleGPUImpl::setCategoryWeights\n");
853 #endif
854 
855  return BEAGLE_SUCCESS;
856 }
857 
858 BEAGLE_GPU_TEMPLATE
859 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {
860 
861 #ifdef BEAGLE_DEBUG_FLOW
862  fprintf(stderr, "\tEntering BeagleGPUImpl::updateCategoryRates\n");
863 #endif
864 
865  const double* categoryRates = inCategoryRates;
866  // Can keep these in double-precision until after multiplication by (double) branch-length
867 
868  memcpy(hCategoryRates, categoryRates, sizeof(double) * kCategoryCount);
869 
870 #ifdef BEAGLE_DEBUG_FLOW
871  fprintf(stderr, "\tLeaving BeagleGPUImpl::updateCategoryRates\n");
872 #endif
873 
874  return BEAGLE_SUCCESS;
875 }
876 
877 BEAGLE_GPU_TEMPLATE
878 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {
879 
880 #ifdef BEAGLE_DEBUG_FLOW
881  fprintf(stderr, "\tEntering BeagleGPUImpl::setPatternWeights\n");
882 #endif
883 
884 //#ifdef DOUBLE_PRECISION
885 // const double* tmpWeights = inPatternWeights;
886 //#else
887 // Real* tmpWeights = hPatternWeightsCache;
888 // MEMCNV(hPatternWeightsCache, inPatternWeights, kPatternCount, Real);
889 //#endif
890  const Real* tmpWeights = beagleCastIfNecessary(inPatternWeights, hPatternWeightsCache, kPatternCount);
891 
892  gpu->MemcpyHostToDevice(dPatternWeights, tmpWeights,
893  sizeof(Real) * kPatternCount);
894 
895 #ifdef BEAGLE_DEBUG_FLOW
896  fprintf(stderr, "\tLeaving BeagleGPUImpl::setPatternWeights\n");
897 #endif
898 
899  return BEAGLE_SUCCESS;
900 }
901 
902 BEAGLE_GPU_TEMPLATE
903 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getTransitionMatrix(int matrixIndex,
904  double* outMatrix) {
905 #ifdef BEAGLE_DEBUG_FLOW
906  fprintf(stderr, "\tEntering BeagleGPUImpl::getTransitionMatrix\n");
907 #endif
908 
909  gpu->MemcpyDeviceToHost(hMatrixCache, dMatrices[matrixIndex], sizeof(Real) * kMatrixSize * kCategoryCount);
910 
911  double* outMatrixOffset = outMatrix;
912  Real* tmpRealMatrixOffset = hMatrixCache;
913 
914  for (int l = 0; l < kCategoryCount; l++) {
915 
916  transposeSquareMatrix(tmpRealMatrixOffset, kPaddedStateCount);
917 
918  for (int i = 0; i < kStateCount; i++) {
919 //#ifdef DOUBLE_PRECISION
920 // memcpy(outMatrixOffset, tmpRealMatrixOffset, sizeof(Real) * kStateCount);
921 //#else
922 // MEMCNV(outMatrixOffset, tmpRealMatrixOffset, kStateCount, double);
923 //#endif
924  beagleMemCpy(outMatrixOffset, tmpRealMatrixOffset, kStateCount);
925  tmpRealMatrixOffset += kPaddedStateCount;
926  outMatrixOffset += kStateCount;
927  }
928  tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
929  }
930 
931 #ifdef BEAGLE_DEBUG_FLOW
932  fprintf(stderr, "\tLeaving BeagleGPUImpl::getTransitionMatrix\n");
933 #endif
934 
935  return BEAGLE_SUCCESS;
936 }
937 
938 BEAGLE_GPU_TEMPLATE
939 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrix(int matrixIndex,
940  const double* inMatrix,
941  double paddedValue) {
942 
943 #ifdef BEAGLE_DEBUG_FLOW
944  fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrix\n");
945 #endif
946 
947  const double* inMatrixOffset = inMatrix;
948  Real* tmpRealMatrixOffset = hMatrixCache;
949 
950  for (int l = 0; l < kCategoryCount; l++) {
951  Real* transposeOffset = tmpRealMatrixOffset;
952 
953  for (int i = 0; i < kStateCount; i++) {
954 //#ifdef DOUBLE_PRECISION
955 // memcpy(tmpRealMatrixOffset, inMatrixOffset, sizeof(Real) * kStateCount);
956 //#else
957 // MEMCNV(tmpRealMatrixOffset, inMatrixOffset, kStateCount, Real);
958 //#endif
959  beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
960  tmpRealMatrixOffset += kPaddedStateCount;
961  inMatrixOffset += kStateCount;
962  }
963 
964  transposeSquareMatrix(transposeOffset, kPaddedStateCount);
965  tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
966  }
967 
968  // Copy to GPU device
969  gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
970  sizeof(Real) * kMatrixSize * kCategoryCount);
971 
972 #ifdef BEAGLE_DEBUG_FLOW
973  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrix\n");
974 #endif
975 
976  return BEAGLE_SUCCESS;
977 }
978 
979 BEAGLE_GPU_TEMPLATE
980 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
981  const double* inMatrices,
982  const double* paddedValues,
983  int count) {
984 
985 #ifdef BEAGLE_DEBUG_FLOW
986  fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrices\n");
987 #endif
988 
989  int k = 0;
990  while (k < count) {
991  const double* inMatrixOffset = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
992  Real* tmpRealMatrixOffset = hMatrixCache;
993  int lumpedMatricesCount = 0;
994  int matrixIndex = matrixIndices[k];
995 
996  do {
997  for (int l = 0; l < kCategoryCount; l++) {
998  Real* transposeOffset = tmpRealMatrixOffset;
999 
1000  for (int i = 0; i < kStateCount; i++) {
1001 // #ifdef DOUBLE_PRECISION
1002 // memcpy(tmpRealMatrixOffset, inMatrixOffset, sizeof(Real) * kStateCount);
1003 // #else
1004 // MEMCNV(tmpRealMatrixOffset, inMatrixOffset, kStateCount, Real);
1005 // #endif
1006  beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
1007  tmpRealMatrixOffset += kPaddedStateCount;
1008  inMatrixOffset += kStateCount;
1009  }
1010 
1011  transposeSquareMatrix(transposeOffset, kPaddedStateCount);
1012  tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
1013  }
1014 
1015  lumpedMatricesCount++;
1016  k++;
1017  } while ((k < count) && (matrixIndices[k] == matrixIndices[k-1] + 1) && (lumpedMatricesCount < BEAGLE_CACHED_MATRICES_COUNT));
1018 
1019  // Copy to GPU device
1020  gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
1021  sizeof(Real) * kMatrixSize * kCategoryCount * lumpedMatricesCount);
1022 
1023  }
1024 
1025 #ifdef BEAGLE_DEBUG_FLOW
1026  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrices\n");
1027 #endif
1028 
1029  return BEAGLE_SUCCESS;
1030 }
1031 
1032 BEAGLE_GPU_TEMPLATE
1033 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
1034  const int* probabilityIndices,
1035  const int* firstDerivativeIndices,
1036  const int* secondDerivativeIndices,
1037  const double* edgeLengths,
1038  int count) {
1039 #ifdef BEAGLE_DEBUG_FLOW
1040  fprintf(stderr,"\tEntering BeagleGPUImpl::updateTransitionMatrices\n");
1041 #endif
1042  if (count > 0) {
1043  // TODO: improve performance of calculation of derivatives
1044  int totalCount = 0;
1045 
1046  int indexOffset = kMatrixSize * kCategoryCount;
1047  int categoryOffset = kMatrixSize;
1048 
1049  #ifdef CUDA
1050 
1051  if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
1052  for (int i = 0; i < count; i++) {
1053  for (int j = 0; j < kCategoryCount; j++) {
1054  hPtrQueue[totalCount] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1055  hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
1056  totalCount++;
1057  }
1058  }
1059 
1060  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount);
1061  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);
1062 
1063  // Set-up and call GPU kernel
1064  kernels->GetTransitionProbabilitiesSquare(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1065  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1066  } else if (secondDerivativeIndices == NULL) {
1067 
1068  totalCount = count * kCategoryCount;
1069  int ptrIndex = 0;
1070  for (int i = 0; i < count; i++) {
1071  for (int j = 0; j < kCategoryCount; j++) {
1072  hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1073  hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
1074  hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
1075  hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
1076  ptrIndex++;
1077  }
1078  }
1079 
1080  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 2);
1081  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);
1082 
1083  kernels->GetTransitionProbabilitiesSquareFirstDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1084  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1085 
1086  } else {
1087  totalCount = count * kCategoryCount;
1088  int ptrIndex = 0;
1089  for (int i = 0; i < count; i++) {
1090  for (int j = 0; j < kCategoryCount; j++) {
1091  hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1092  hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
1093  hPtrQueue[ptrIndex + totalCount*2] = secondDerivativeIndices[i] * indexOffset + j * categoryOffset;
1094  hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
1095  hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
1096  ptrIndex++;
1097  }
1098  }
1099 
1100  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 3);
1101  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);
1102 
1103  kernels->GetTransitionProbabilitiesSquareSecondDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1104  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1105  }
1106 
1107 
1108  #else
1109  // TODO: update OpenCL implementation with derivs
1110 
1111  for (int i = 0; i < count; i++) {
1112  for (int j = 0; j < kCategoryCount; j++) {
1113  hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
1114  totalCount++;
1115  }
1116  }
1117 
1118  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);
1119 
1120  // Set-up and call GPU kernel
1121  for (int i = 0; i < count; i++) {
1122  kernels->GetTransitionProbabilitiesSquare(dMatrices[probabilityIndices[i]], dEvec[eigenIndex], dIevc[eigenIndex],
1123  dEigenValues[eigenIndex], dDistanceQueue, kCategoryCount,
1124  i * kCategoryCount);
1125  }
1126 
1127  #endif
1128 
1129  #ifdef BEAGLE_DEBUG_VALUES
1130  Real r = 0;
1131  for (int i = 0; i < 1; i++) {
1132  fprintf(stderr, "dMatrices[probabilityIndices[%d]] (hDQ = %1.5e, eL = %1.5e) =\n", i,hDistanceQueue[i], edgeLengths[i]);
1133  gpu->PrintfDeviceVector(dMatrices[probabilityIndices[i]], kMatrixSize * kCategoryCount, r);
1134  for(int j=0; j<kCategoryCount; j++)
1135  fprintf(stderr, " %1.5f",hCategoryRates[j]);
1136  fprintf(stderr,"\n");
1137  }
1138  #endif
1139 
1140  #ifdef BEAGLE_DEBUG_SYNCH
1141  gpu->Synchronize();
1142  #endif
1143  }
1144 #ifdef BEAGLE_DEBUG_FLOW
1145  fprintf(stderr, "\tLeaving BeagleGPUImpl::updateTransitionMatrices\n");
1146 #endif
1147 
1148  return BEAGLE_SUCCESS;
1149 }
1150 
1151 BEAGLE_GPU_TEMPLATE
1152 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updatePartials(const int* operations,
1153  int operationCount,
1154  int cumulativeScalingIndex) {
1155 
1156 #ifdef BEAGLE_DEBUG_FLOW
1157  fprintf(stderr, "\tEntering BeagleGPUImpl::updatePartials\n");
1158 #endif
1159 
1160  GPUPtr cumulativeScalingBuffer = 0;
1161  if (cumulativeScalingIndex != BEAGLE_OP_NONE)
1162  cumulativeScalingBuffer = dScalingFactors[cumulativeScalingIndex];
1163 
1164  // Serial version
1165  for (int op = 0; op < operationCount; op++) {
1166  const int parIndex = operations[op * 7];
1167  const int writeScalingIndex = operations[op * 7 + 1];
1168  const int readScalingIndex = operations[op * 7 + 2];
1169  const int child1Index = operations[op * 7 + 3];
1170  const int child1TransMatIndex = operations[op * 7 + 4];
1171  const int child2Index = operations[op * 7 + 5];
1172  const int child2TransMatIndex = operations[op * 7 + 6];
1173 
1174  GPUPtr matrices1 = dMatrices[child1TransMatIndex];
1175  GPUPtr matrices2 = dMatrices[child2TransMatIndex];
1176 
1177  GPUPtr partials1 = dPartials[child1Index];
1178  GPUPtr partials2 = dPartials[child2Index];
1179 
1180  GPUPtr partials3 = dPartials[parIndex];
1181 
1182  GPUPtr tipStates1 = dStates[child1Index];
1183  GPUPtr tipStates2 = dStates[child2Index];
1184 
1185  int rescale = BEAGLE_OP_NONE;
1186  GPUPtr scalingFactors = (GPUPtr)NULL;
1187 
1188  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1189  int sIndex = parIndex - kTipCount;
1190 
1191  if (tipStates1 == 0 && tipStates2 == 0) {
1192  rescale = 2;
1193  scalingFactors = dScalingFactors[sIndex];
1194  }
1195  } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1196  rescale = 1;
1197  scalingFactors = dScalingFactors[parIndex - kTipCount];
1198  } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && writeScalingIndex >= 0) {
1199  rescale = 1;
1200  scalingFactors = dScalingFactors[writeScalingIndex];
1201  } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && readScalingIndex >= 0) {
1202  rescale = 0;
1203  scalingFactors = dScalingFactors[readScalingIndex];
1204  }
1205 
1206 #ifdef BEAGLE_DEBUG_VALUES
1207  fprintf(stderr, "kPaddedPatternCount = %d\n", kPaddedPatternCount);
1208  fprintf(stderr, "kPatternCount = %d\n", kPatternCount);
1209  fprintf(stderr, "categoryCount = %d\n", kCategoryCount);
1210  fprintf(stderr, "partialSize = %d\n", kPartialsSize);
1211  fprintf(stderr, "writeIndex = %d, readIndex = %d, rescale = %d\n",writeScalingIndex,readScalingIndex,rescale);
1212  fprintf(stderr, "child1 = \n");
1213  Real r = 0;
1214  if (tipStates1)
1215  gpu->PrintfDeviceInt(tipStates1, kPaddedPatternCount);
1216  else
1217  gpu->PrintfDeviceVector(partials1, kPartialsSize, r);
1218  fprintf(stderr, "child2 = \n");
1219  if (tipStates2)
1220  gpu->PrintfDeviceInt(tipStates2, kPaddedPatternCount);
1221  else
1222  gpu->PrintfDeviceVector(partials2, kPartialsSize, r);
1223  fprintf(stderr, "node index = %d\n", parIndex);
1224 #endif
1225 
1226  if (tipStates1 != 0) {
1227  if (tipStates2 != 0 ) {
1228  kernels->StatesStatesPruningDynamicScaling(tipStates1, tipStates2, partials3,
1229  matrices1, matrices2, scalingFactors,
1230  cumulativeScalingBuffer,
1231  kPaddedPatternCount, kCategoryCount,
1232  rescale);
1233  } else {
1234  kernels->StatesPartialsPruningDynamicScaling(tipStates1, partials2, partials3,
1235  matrices1, matrices2, scalingFactors,
1236  cumulativeScalingBuffer,
1237  kPaddedPatternCount, kCategoryCount,
1238  rescale);
1239  }
1240  } else {
1241  if (tipStates2 != 0) {
1242  kernels->StatesPartialsPruningDynamicScaling(tipStates2, partials1, partials3,
1243  matrices2, matrices1, scalingFactors,
1244  cumulativeScalingBuffer,
1245  kPaddedPatternCount, kCategoryCount,
1246  rescale);
1247  } else {
1248  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1249  kernels->PartialsPartialsPruningDynamicCheckScaling(partials1, partials2, partials3,
1250  matrices1, matrices2, writeScalingIndex, readScalingIndex,
1251  cumulativeScalingIndex, dScalingFactors, dScalingFactorsMaster,
1252  kPaddedPatternCount, kCategoryCount,
1253  rescale, hRescalingTrigger, dRescalingTrigger, sizeof(Real));
1254  } else {
1255  kernels->PartialsPartialsPruningDynamicScaling(partials1, partials2, partials3,
1256  matrices1, matrices2, scalingFactors,
1257  cumulativeScalingBuffer,
1258  kPaddedPatternCount, kCategoryCount,
1259  rescale);
1260  }
1261  }
1262  }
1263 
1264  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1265  int parScalingIndex = parIndex - kTipCount;
1266  int child1ScalingIndex = child1Index - kTipCount;
1267  int child2ScalingIndex = child2Index - kTipCount;
1268  if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
1269  int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
1270  accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
1271  } else if (child1ScalingIndex >= 0) {
1272  int scalingIndices[1] = {child1ScalingIndex};
1273  accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
1274  } else if (child2ScalingIndex >= 0) {
1275  int scalingIndices[1] = {child2ScalingIndex};
1276  accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
1277  }
1278  }
1279 
1280 #ifdef BEAGLE_DEBUG_VALUES
1281  if (rescale > -1) {
1282  fprintf(stderr,"scalars = ");
1283  gpu->PrintfDeviceVector(scalingFactors,kPaddedPatternCount, r);
1284  }
1285  fprintf(stderr, "parent = \n");
1286  int signal = 0;
1287  if (writeScalingIndex == -1)
1288  gpu->PrintfDeviceVector(partials3, kPartialsSize, r);
1289  else
1290  gpu->PrintfDeviceVector(partials3, kPartialsSize, 1.0, &signal, r);
1291 #endif
1292  }
1293 
1294 #ifdef BEAGLE_DEBUG_SYNCH
1295  gpu->Synchronize();
1296 #endif
1297 
1298 #ifdef BEAGLE_DEBUG_FLOW
1299  fprintf(stderr, "\tLeaving BeagleGPUImpl::updatePartials\n");
1300 #endif
1301 
1302  return BEAGLE_SUCCESS;
1303 }
1304 
1305 BEAGLE_GPU_TEMPLATE
1306 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::waitForPartials(const int* /*destinationPartials*/,
1307  int /*destinationPartialsCount*/) {
1308 #ifdef BEAGLE_DEBUG_FLOW
1309  fprintf(stderr, "\tEntering BeagleGPUImpl::waitForPartials\n");
1310 #endif
1311 
1312 #ifdef BEAGLE_DEBUG_FLOW
1313  fprintf(stderr, "\tLeaving BeagleGPUImpl::waitForPartials\n");
1314 #endif
1315 
1316  return BEAGLE_SUCCESS;
1317 }
1318 
1319 BEAGLE_GPU_TEMPLATE
1320 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
1321  int count,
1322  int cumulativeScalingIndex) {
1323 #ifdef BEAGLE_DEBUG_FLOW
1324  fprintf(stderr, "\tEntering BeagleGPUImpl::accumulateScaleFactors\n");
1325 #endif
1326 
1327  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1328  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1329  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1330  gpu->Synchronize();
1331  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1332  }
1333  }
1334 
1335  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1336 
1337  for(int n = 0; n < count; n++) {
1338  int sIndex = scalingIndices[n] - kTipCount;
1339  hPtrQueue[n] = sIndex;
1340  }
1341 
1342  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1343 
1344  kernels->AccumulateFactorsAutoScaling(dScalingFactors[0], dPtrQueue, dAccumulatedScalingFactors, count, kPaddedPatternCount, kScaleBufferSize);
1345 
1346  } else {
1347  for(int n = 0; n < count; n++)
1348  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1349  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1350 
1351 
1352  #ifdef CUDA
1353  // Compute scaling factors at the root
1354  kernels->AccumulateFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], count, kPaddedPatternCount);
1355  #else // OpenCL
1356  for (int i = 0; i < count; i++) {
1357  kernels->AccumulateFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1358  1, kPaddedPatternCount);
1359  }
1360  #endif
1361  }
1362 
1363 #ifdef BEAGLE_DEBUG_SYNCH
1364  gpu->Synchronize();
1365 #endif
1366 
1367 #ifdef BEAGLE_DEBUG_VALUES
1368  Real r = 0;
1369  fprintf(stderr, "scaling factors = ");
1370  gpu->PrintfDeviceVector(dScalingFactors[cumulativeScalingIndex], kPaddedPatternCount, r);
1371 #endif
1372 
1373 #ifdef BEAGLE_DEBUG_FLOW
1374  fprintf(stderr, "\tLeaving BeagleGPUImpl::accumulateScaleFactors\n");
1375 #endif
1376 
1377  return BEAGLE_SUCCESS;
1378 }
1379 
1380 BEAGLE_GPU_TEMPLATE
1381 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
1382  int count,
1383  int cumulativeScalingIndex) {
1384 
1385 #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Compute scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue,