30 #include "libhmsbeagle/config.h"
39 #include "libhmsbeagle/beagle.h"
40 #include "libhmsbeagle/GPU/GPUImplDefs.h"
41 #include "libhmsbeagle/GPU/BeagleGPUImpl.h"
42 #include "libhmsbeagle/GPU/GPUImplHelper.h"
43 #include "libhmsbeagle/GPU/KernelLauncher.h"
44 #include "libhmsbeagle/GPU/GPUInterface.h"
45 #include "libhmsbeagle/GPU/Precision.h"
51 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::BeagleGPUImpl() {
56 dIntegrationTmp = (GPUPtr)NULL;
57 dOutFirstDeriv = (GPUPtr)NULL;
58 dOutSecondDeriv = (GPUPtr)NULL;
59 dPartialsTmp = (GPUPtr)NULL;
60 dFirstDerivTmp = (GPUPtr)NULL;
61 dSecondDerivTmp = (GPUPtr)NULL;
63 dSumLogLikelihood = (GPUPtr)NULL;
64 dSumFirstDeriv = (GPUPtr)NULL;
65 dSumSecondDeriv = (GPUPtr)NULL;
67 dPatternWeights = (GPUPtr)NULL;
69 dBranchLengths = (GPUPtr)NULL;
71 dDistanceQueue = (GPUPtr)NULL;
73 dPtrQueue = (GPUPtr)NULL;
75 dMaxScalingFactors = (GPUPtr)NULL;
76 dIndexMaxScalingFactors = (GPUPtr)NULL;
85 dScalingFactors = NULL;
92 dCompactBuffers = NULL;
93 dTipPartialsBuffers = NULL;
97 hCategoryRates = NULL;
99 hPatternWeightsCache = NULL;
101 hDistanceQueue = NULL;
103 hWeightsCache = NULL;
104 hFrequenciesCache = NULL;
105 hLogLikelihoodsCache = NULL;
106 hPartialsCache = NULL;
110 hRescalingTrigger = NULL;
111 dRescalingTrigger = (GPUPtr)NULL;
112 dScalingFactorsMaster = NULL;
116 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::~BeagleGPUImpl() {
119 for (
int i=0; i < kEigenDecompCount; i++) {
120 gpu->FreeMemory(dEigenValues[i]);
121 gpu->FreeMemory(dEvec[i]);
122 gpu->FreeMemory(dIevc[i]);
123 gpu->FreeMemory(dWeights[i]);
124 gpu->FreeMemory(dFrequencies[i]);
127 gpu->FreeMemory(dMatrices[0]);
129 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
130 gpu->FreePinnedHostMemory(hRescalingTrigger);
131 for (
int i = 0; i < kScaleBufferCount; i++) {
132 if (dScalingFactorsMaster[i] != 0)
133 gpu->FreeMemory(dScalingFactorsMaster[i]);
135 free(dScalingFactorsMaster);
137 if (kScaleBufferCount > 0)
138 gpu->FreeMemory(dScalingFactors[0]);
141 for (
int i = 0; i < kBufferCount; i++) {
143 if (i < kCompactBufferCount)
144 gpu->FreeMemory(dCompactBuffers[i]);
145 if (i < kTipPartialsBufferCount)
146 gpu->FreeMemory(dTipPartialsBuffers[i]);
148 gpu->FreeMemory(dPartials[i]);
152 gpu->FreeMemory(dIntegrationTmp);
153 gpu->FreeMemory(dOutFirstDeriv);
154 gpu->FreeMemory(dOutSecondDeriv);
155 gpu->FreeMemory(dPartialsTmp);
156 gpu->FreeMemory(dFirstDerivTmp);
157 gpu->FreeMemory(dSecondDerivTmp);
159 gpu->FreeMemory(dSumLogLikelihood);
160 gpu->FreeMemory(dSumFirstDeriv);
161 gpu->FreeMemory(dSumSecondDeriv);
163 gpu->FreeMemory(dPatternWeights);
165 gpu->FreeMemory(dBranchLengths);
167 gpu->FreeMemory(dDistanceQueue);
169 gpu->FreeMemory(dPtrQueue);
171 gpu->FreeMemory(dMaxScalingFactors);
172 gpu->FreeMemory(dIndexMaxScalingFactors);
174 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
175 gpu->FreeMemory(dAccumulatedScalingFactors);
185 free(dScalingFactors);
192 free(dCompactBuffers);
193 free(dTipPartialsBuffers);
195 gpu->FreeHostMemory(hPtrQueue);
197 gpu->FreeHostMemory(hCategoryRates);
199 gpu->FreeHostMemory(hPatternWeightsCache);
201 gpu->FreeHostMemory(hDistanceQueue);
203 gpu->FreeHostMemory(hWeightsCache);
204 gpu->FreeHostMemory(hFrequenciesCache);
205 gpu->FreeHostMemory(hPartialsCache);
206 gpu->FreeHostMemory(hStatesCache);
208 gpu->FreeHostMemory(hLogLikelihoodsCache);
209 gpu->FreeHostMemory(hMatrixCache);
221 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::createInstance(
int tipCount,
222 int partialsBufferCount,
223 int compactBufferCount,
226 int eigenDecompositionCount,
229 int scaleBufferCount,
231 long preferenceFlags,
232 long requirementFlags) {
234 #ifdef BEAGLE_DEBUG_FLOW
235 fprintf(stderr,
"\tEntering BeagleGPUImpl::createInstance\n");
240 kTipCount = tipCount;
241 kPartialsBufferCount = partialsBufferCount;
242 kCompactBufferCount = compactBufferCount;
243 kStateCount = stateCount;
244 kPatternCount = patternCount;
245 kEigenDecompCount = eigenDecompositionCount;
246 kMatrixCount = matrixCount;
247 kCategoryCount = categoryCount;
248 kScaleBufferCount = scaleBufferCount;
250 resourceNumber = iResourceNumber;
252 kTipPartialsBufferCount = kTipCount - kCompactBufferCount;
253 kBufferCount = kPartialsBufferCount + kCompactBufferCount;
255 kInternalPartialsBufferCount = kBufferCount - kTipCount;
257 if (kStateCount <= 4)
258 kPaddedStateCount = 4;
259 else if (kStateCount <= 16)
260 kPaddedStateCount = 16;
261 else if (kStateCount <= 32)
262 kPaddedStateCount = 32;
263 else if (kStateCount <= 48)
264 kPaddedStateCount = 48;
265 else if (kStateCount <= 64)
266 kPaddedStateCount = 64;
267 else if (kStateCount <= 80)
268 kPaddedStateCount = 80;
269 else if (kStateCount <= 128)
270 kPaddedStateCount = 128;
271 else if (kStateCount <= 192)
272 kPaddedStateCount = 192;
274 kPaddedStateCount = kStateCount + kStateCount % 16;
277 int paddedPatterns = 0;
278 if (kPaddedStateCount == 4 && kPatternCount % 4 != 0)
279 paddedPatterns = 4 - kPatternCount % 4;
283 kPaddedPatternCount = kPatternCount + paddedPatterns;
285 int resultPaddedPatterns = 0;
286 int patternBlockSizeFour = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? PATTERN_BLOCK_SIZE_DP_4 : PATTERN_BLOCK_SIZE_SP_4);
287 if (kPaddedStateCount == 4 && kPaddedPatternCount % patternBlockSizeFour != 0)
288 resultPaddedPatterns = patternBlockSizeFour - kPaddedPatternCount % patternBlockSizeFour;
290 kScaleBufferSize = kPaddedPatternCount;
294 if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
295 kFlags |= BEAGLE_FLAG_SCALING_AUTO;
296 kFlags |= BEAGLE_FLAG_SCALERS_LOG;
297 kScaleBufferCount = kInternalPartialsBufferCount;
298 kScaleBufferSize *= kCategoryCount;
299 }
else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
300 kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
301 kFlags |= BEAGLE_FLAG_SCALERS_LOG;
302 kScaleBufferCount = kInternalPartialsBufferCount + 1;
303 }
else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
304 kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
305 kFlags |= BEAGLE_FLAG_SCALERS_RAW;
306 }
else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
307 kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
308 kFlags |= BEAGLE_FLAG_SCALERS_LOG;
310 kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
311 kFlags |= BEAGLE_FLAG_SCALERS_RAW;
314 if (preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX || requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
315 kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
317 kFlags |= BEAGLE_FLAG_EIGEN_REAL;
320 if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
321 kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
323 kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;
326 modifyFlagsForPrecision(&kFlags, r);
328 int sumSitesBlockSize = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? SUM_SITES_BLOCK_SIZE_DP : SUM_SITES_BLOCK_SIZE_SP);
329 kSumSitesBlockCount = kPatternCount / sumSitesBlockSize;
330 if (kPatternCount % sumSitesBlockSize != 0)
331 kSumSitesBlockCount += 1;
333 kPartialsSize = kPaddedPatternCount * kPaddedStateCount * kCategoryCount;
334 kMatrixSize = kPaddedStateCount * kPaddedStateCount;
336 if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
337 kEigenValuesSize = 2 * kPaddedStateCount;
339 kEigenValuesSize = kPaddedStateCount;
341 kLastCompactBufferIndex = -1;
342 kLastTipPartialsBufferIndex = -1;
349 numDevices = gpu->GetDeviceCount();
350 if (numDevices == 0) {
351 fprintf(stderr,
"Error: No GPU devices\n");
352 return BEAGLE_ERROR_NO_RESOURCE;
354 if (resourceNumber > numDevices) {
355 fprintf(stderr,
"Error: Trying to initialize device # %d (which does not exist)\n",resourceNumber);
356 return BEAGLE_ERROR_NO_RESOURCE;
360 gpu->SetDevice(resourceNumber-1,kPaddedStateCount,kCategoryCount,kPaddedPatternCount,kFlags);
362 int ptrQueueLength = kMatrixCount * kCategoryCount * 3;
363 if (kPartialsBufferCount > ptrQueueLength)
364 ptrQueueLength = kPartialsBufferCount;
366 unsigned int neededMemory =
sizeof(Real) * (kMatrixSize * kEigenDecompCount +
367 kMatrixSize * kEigenDecompCount +
368 kEigenValuesSize * kEigenDecompCount +
369 kCategoryCount * kPartialsBufferCount +
370 kPaddedStateCount * kPartialsBufferCount +
371 kPaddedPatternCount +
372 kPaddedPatternCount +
373 kPaddedPatternCount +
377 kScaleBufferCount * kPaddedPatternCount +
378 kPartialsBufferCount * kPartialsSize +
379 kMatrixCount * kMatrixSize * kCategoryCount +
381 kMatrixCount * kCategoryCount * 2) +
382 sizeof(int) * kCompactBufferCount * kPaddedPatternCount +
383 sizeof(GPUPtr) * ptrQueueLength;
386 unsigned int availableMem = gpu->GetAvailableMemory();
388 #ifdef BEAGLE_DEBUG_VALUES
389 fprintf(stderr,
" needed memory: %d\n", neededMemory);
390 fprintf(stderr,
" available memory: %d\n", availableMem);
393 if (availableMem < neededMemory)
394 return BEAGLE_ERROR_OUT_OF_MEMORY;
399 hWeightsCache = (Real*) gpu->CallocHost(kCategoryCount * kPartialsBufferCount,
sizeof(Real));
400 hFrequenciesCache = (Real*) gpu->CallocHost(kPaddedStateCount * kPartialsBufferCount,
sizeof(Real));
401 hPartialsCache = (Real*) gpu->CallocHost(kPartialsSize,
sizeof(Real));
402 hStatesCache = (
int*) gpu->CallocHost(kPaddedPatternCount,
sizeof(
int));
404 int hMatrixCacheSize = kMatrixSize * kCategoryCount * BEAGLE_CACHED_MATRICES_COUNT;
405 if ((2 * kMatrixSize + kEigenValuesSize) > hMatrixCacheSize)
406 hMatrixCacheSize = 2 * kMatrixSize + kEigenValuesSize;
408 hLogLikelihoodsCache = (Real*) gpu->MallocHost(kPatternCount *
sizeof(Real));
409 hMatrixCache = (Real*) gpu->CallocHost(hMatrixCacheSize,
sizeof(Real));
411 dEvec = (GPUPtr*) calloc(
sizeof(GPUPtr),kEigenDecompCount);
412 dIevc = (GPUPtr*) calloc(
sizeof(GPUPtr),kEigenDecompCount);
413 dEigenValues = (GPUPtr*) calloc(
sizeof(GPUPtr),kEigenDecompCount);
414 dWeights = (GPUPtr*) calloc(
sizeof(GPUPtr),kEigenDecompCount);
415 dFrequencies = (GPUPtr*) calloc(
sizeof(GPUPtr),kEigenDecompCount);
417 dMatrices = (GPUPtr*) malloc(
sizeof(GPUPtr) * kMatrixCount);
418 dMatrices[0] = gpu->AllocateMemory(kMatrixCount * kMatrixSize * kCategoryCount *
sizeof(Real));
420 for (
int i = 1; i < kMatrixCount; i++) {
421 dMatrices[i] = dMatrices[i-1] + kMatrixSize * kCategoryCount *
sizeof(Real);
424 if (kScaleBufferCount > 0) {
425 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
426 dScalingFactors = (GPUPtr*) malloc(
sizeof(GPUPtr) * kScaleBufferCount);
427 dScalingFactors[0] = gpu->AllocateMemory(
sizeof(
signed char) * kScaleBufferSize * kScaleBufferCount);
428 for (
int i=1; i < kScaleBufferCount; i++)
429 dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize *
sizeof(
signed char);
430 }
else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
431 dScalingFactors = (GPUPtr*) calloc(
sizeof(GPUPtr), kScaleBufferCount);
432 dScalingFactorsMaster = (GPUPtr*) calloc(
sizeof(GPUPtr), kScaleBufferCount);
433 hRescalingTrigger = (
int*) gpu->AllocatePinnedHostMemory(
sizeof(
int),
false,
true);
434 dRescalingTrigger = gpu->GetDevicePointer((
void*) hRescalingTrigger);
436 dScalingFactors = (GPUPtr*) malloc(
sizeof(GPUPtr) * kScaleBufferCount);
437 dScalingFactors[0] = gpu->AllocateMemory(kScaleBufferSize * kScaleBufferCount *
sizeof(Real));
438 for (
int i=1; i < kScaleBufferCount; i++) {
439 dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize *
sizeof(Real);
444 for(
int i=0; i<kEigenDecompCount; i++) {
445 dEvec[i] = gpu->AllocateMemory(kMatrixSize *
sizeof(Real));
446 dIevc[i] = gpu->AllocateMemory(kMatrixSize *
sizeof(Real));
447 dEigenValues[i] = gpu->AllocateMemory(kEigenValuesSize *
sizeof(Real));
448 dWeights[i] = gpu->AllocateMemory(kCategoryCount *
sizeof(Real));
449 dFrequencies[i] = gpu->AllocateMemory(kPaddedStateCount *
sizeof(Real));
453 dIntegrationTmp = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) *
sizeof(Real));
454 dOutFirstDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) *
sizeof(Real));
455 dOutSecondDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) *
sizeof(Real));
457 dPatternWeights = gpu->AllocateMemory(kPatternCount *
sizeof(Real));
459 dSumLogLikelihood = gpu->AllocateMemory(kSumSitesBlockCount *
sizeof(Real));
460 dSumFirstDeriv = gpu->AllocateMemory(kSumSitesBlockCount *
sizeof(Real));
461 dSumSecondDeriv = gpu->AllocateMemory(kSumSitesBlockCount *
sizeof(Real));
463 dPartialsTmp = gpu->AllocateMemory(kPartialsSize *
sizeof(Real));
464 dFirstDerivTmp = gpu->AllocateMemory(kPartialsSize *
sizeof(Real));
465 dSecondDerivTmp = gpu->AllocateMemory(kPartialsSize *
sizeof(Real));
468 dPartials = (GPUPtr*) calloc(
sizeof(GPUPtr), kBufferCount);
471 dStates = (GPUPtr*) calloc(
sizeof(GPUPtr), kBufferCount);
473 dCompactBuffers = (GPUPtr*) malloc(
sizeof(GPUPtr) * kCompactBufferCount);
474 dTipPartialsBuffers = (GPUPtr*) malloc(
sizeof(GPUPtr) * kTipPartialsBufferCount);
476 for (
int i = 0; i < kBufferCount; i++) {
478 if (i < kCompactBufferCount)
479 dCompactBuffers[i] = gpu->AllocateMemory(kPaddedPatternCount *
sizeof(
int));
480 if (i < kTipPartialsBufferCount)
481 dTipPartialsBuffers[i] = gpu->AllocateMemory(kPartialsSize *
sizeof(Real));
483 dPartials[i] = gpu->AllocateMemory(kPartialsSize *
sizeof(Real));
487 kLastCompactBufferIndex = kCompactBufferCount - 1;
488 kLastTipPartialsBufferIndex = kTipPartialsBufferCount - 1;
491 dBranchLengths = gpu->AllocateMemory(kBufferCount *
sizeof(Real));
493 dDistanceQueue = gpu->AllocateMemory(kMatrixCount * kCategoryCount * 2 *
sizeof(Real));
494 hDistanceQueue = (Real*) gpu->MallocHost(
sizeof(Real) * kMatrixCount * kCategoryCount * 2);
495 checkHostMemory(hDistanceQueue);
497 dPtrQueue = gpu->AllocateMemory(
sizeof(
unsigned int) * ptrQueueLength);
498 hPtrQueue = (
unsigned int*) gpu->MallocHost(
sizeof(
unsigned int) * ptrQueueLength);
499 checkHostMemory(hPtrQueue);
501 hCategoryRates = (
double*) gpu->MallocHost(
sizeof(
double) * kCategoryCount);
502 checkHostMemory(hCategoryRates);
504 hPatternWeightsCache = (Real*) gpu->MallocHost(
sizeof(
double) * kPatternCount);
505 checkHostMemory(hPatternWeightsCache);
507 dMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount *
sizeof(Real));
508 dIndexMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount *
sizeof(
int));
510 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
511 dAccumulatedScalingFactors = gpu->AllocateMemory(
sizeof(
int) * kScaleBufferSize);
514 #ifdef BEAGLE_DEBUG_FLOW
515 fprintf(stderr,
"\tLeaving BeagleGPUImpl::createInstance\n");
520 #ifdef BEAGLE_DEBUG_VALUES
522 int usedMemory = availableMem - gpu->GetAvailableMemory();
523 fprintf(stderr,
"actual used memory: %d\n", usedMemory);
524 fprintf(stderr,
" difference: %d\n\n", usedMemory-neededMemory);
527 return BEAGLE_SUCCESS;
531 char* BeagleGPUImpl<double>::getInstanceName() {
532 return (
char*)
"CUDA-Double";
536 char* BeagleGPUImpl<float>::getInstanceName() {
537 return (
char*)
"CUDA-Single";
542 if (returnInfo != NULL) {
544 returnInfo->
flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
545 BEAGLE_FLAG_THREADING_NONE |
546 BEAGLE_FLAG_VECTOR_NONE |
547 BEAGLE_FLAG_PROCESSOR_GPU;
549 modifyFlagsForPrecision(&(returnInfo->
flags), r);
551 returnInfo->
flags |= kFlags;
553 returnInfo->
implName = getInstanceName();
555 return BEAGLE_SUCCESS;
559 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipStates(
int tipIndex,
560 const int* inStates) {
562 #ifdef BEAGLE_DEBUG_FLOW
563 fprintf(stderr,
"\tEntering BeagleGPUImpl::setTipStates\n");
566 if (tipIndex < 0 || tipIndex >= kTipCount)
567 return BEAGLE_ERROR_OUT_OF_RANGE;
569 for(
int i = 0; i < kPatternCount; i++)
570 hStatesCache[i] = (inStates[i] < kStateCount ? inStates[i] : kPaddedStateCount);
573 for(
int i = kPatternCount; i < kPaddedPatternCount; i++)
574 hStatesCache[i] = kPaddedStateCount;
576 if (dStates[tipIndex] == 0) {
577 assert(kLastCompactBufferIndex >= 0 && kLastCompactBufferIndex < kCompactBufferCount);
578 dStates[tipIndex] = dCompactBuffers[kLastCompactBufferIndex--];
581 gpu->MemcpyHostToDevice(dStates[tipIndex], hStatesCache,
sizeof(
int) * kPaddedPatternCount);
583 #ifdef BEAGLE_DEBUG_FLOW
584 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setTipStates\n");
587 return BEAGLE_SUCCESS;
591 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipPartials(
int tipIndex,
592 const double* inPartials) {
593 #ifdef BEAGLE_DEBUG_FLOW
594 fprintf(stderr,
"\tEntering BeagleGPUImpl::setTipPartials\n");
597 if (tipIndex < 0 || tipIndex >= kTipCount)
598 return BEAGLE_ERROR_OUT_OF_RANGE;
600 const double* inPartialsOffset = inPartials;
601 Real* tmpRealPartialsOffset = hPartialsCache;
602 for (
int i = 0; i < kPatternCount; i++) {
608 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
609 tmpRealPartialsOffset += kPaddedStateCount;
610 inPartialsOffset += kStateCount;
613 int partialsLength = kPaddedPatternCount * kPaddedStateCount;
614 for (
int i = 1; i < kCategoryCount; i++) {
615 memcpy(hPartialsCache + i * partialsLength, hPartialsCache, partialsLength *
sizeof(Real));
618 if (tipIndex < kTipCount) {
619 if (dPartials[tipIndex] == 0) {
620 assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
621 kTipPartialsBufferCount);
622 dPartials[tipIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
626 gpu->MemcpyHostToDevice(dPartials[tipIndex], hPartialsCache,
sizeof(Real) * kPartialsSize);
628 #ifdef BEAGLE_DEBUG_FLOW
629 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setTipPartials\n");
632 return BEAGLE_SUCCESS;
636 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPartials(
int bufferIndex,
637 const double* inPartials) {
638 #ifdef BEAGLE_DEBUG_FLOW
639 fprintf(stderr,
"\tEntering BeagleGPUImpl::setPartials\n");
642 if (bufferIndex < 0 || bufferIndex >= kPartialsBufferCount)
643 return BEAGLE_ERROR_OUT_OF_RANGE;
645 const double* inPartialsOffset = inPartials;
646 Real* tmpRealPartialsOffset = hPartialsCache;
647 for (
int l = 0; l < kCategoryCount; l++) {
648 for (
int i = 0; i < kPatternCount; i++) {
654 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
655 tmpRealPartialsOffset += kPaddedStateCount;
656 inPartialsOffset += kStateCount;
658 tmpRealPartialsOffset += kPaddedStateCount * (kPaddedPatternCount - kPatternCount);
661 if (bufferIndex < kTipCount) {
662 if (dPartials[bufferIndex] == 0) {
663 assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
664 kTipPartialsBufferCount);
665 dPartials[bufferIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
669 gpu->MemcpyHostToDevice(dPartials[bufferIndex], hPartialsCache,
sizeof(Real) * kPartialsSize);
671 #ifdef BEAGLE_DEBUG_FLOW
672 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setPartials\n");
675 return BEAGLE_SUCCESS;
679 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getPartials(
int bufferIndex,
681 double* outPartials) {
682 #ifdef BEAGLE_DEBUG_FLOW
683 fprintf(stderr,
"\tEntering BeagleGPUImpl::getPartials\n");
686 gpu->MemcpyDeviceToHost(hPartialsCache, dPartials[bufferIndex],
sizeof(Real) * kPartialsSize);
688 double* outPartialsOffset = outPartials;
689 Real* tmpRealPartialsOffset = hPartialsCache;
691 for (
int i = 0; i < kPatternCount; i++) {
697 beagleMemCpy(outPartialsOffset, tmpRealPartialsOffset, kStateCount);
698 tmpRealPartialsOffset += kPaddedStateCount;
699 outPartialsOffset += kStateCount;
702 #ifdef BEAGLE_DEBUG_FLOW
703 fprintf(stderr,
"\tLeaving BeagleGPUImpl::getPartials\n");
706 return BEAGLE_SUCCESS;
710 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setEigenDecomposition(
int eigenIndex,
711 const double* inEigenVectors,
712 const double* inInverseEigenVectors,
713 const double* inEigenValues) {
715 #ifdef BEAGLE_DEBUG_FLOW
716 fprintf(stderr,
"\tEntering BeagleGPUImpl::setEigenDecomposition\n");
722 Real* Ievc, * tmpIevc, * Evec, * tmpEvec, * Eval;
724 tmpIevc = Ievc = (Real*) hMatrixCache;
725 tmpEvec = Evec = Ievc + kMatrixSize;
726 Eval = Evec + kMatrixSize;
728 for (
int i = 0; i < kStateCount; i++) {
736 beagleMemCpy(tmpIevc, inInverseEigenVectors + i * kStateCount, kStateCount);
737 beagleMemCpy(tmpEvec, inEigenVectors + i * kStateCount, kStateCount);
738 tmpIevc += kPaddedStateCount;
739 tmpEvec += kPaddedStateCount;
744 if (kFlags & BEAGLE_FLAG_INVEVEC_STANDARD)
745 transposeSquareMatrix(Ievc, kPaddedStateCount);
746 transposeSquareMatrix(Evec, kPaddedStateCount);
757 beagleMemCpy(Eval, inEigenValues, kStateCount);
758 if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
759 beagleMemCpy(Eval + kPaddedStateCount, inEigenValues + kStateCount, kStateCount);
762 #ifdef BEAGLE_DEBUG_VALUES
771 fprintf(stderr,
"Eval =\n");
772 printfVector(Eval, kEigenValuesSize);
773 fprintf(stderr,
"Evec =\n");
774 printfVector(Evec, kMatrixSize);
775 fprintf(stderr,
"Ievc =\n");
776 printfVector(Ievc, kPaddedStateCount * kPaddedStateCount);
781 gpu->MemcpyHostToDevice(dIevc[eigenIndex], Ievc,
sizeof(Real) * kMatrixSize);
782 gpu->MemcpyHostToDevice(dEvec[eigenIndex], Evec,
sizeof(Real) * kMatrixSize);
783 gpu->MemcpyHostToDevice(dEigenValues[eigenIndex], Eval,
sizeof(Real) * kEigenValuesSize);
785 #ifdef BEAGLE_DEBUG_VALUES
787 fprintf(stderr,
"dEigenValues =\n");
788 gpu->PrintfDeviceVector(dEigenValues[eigenIndex], kEigenValuesSize, r);
789 fprintf(stderr,
"dEvec =\n");
790 gpu->PrintfDeviceVector(dEvec[eigenIndex], kMatrixSize, r);
791 fprintf(stderr,
"dIevc =\n");
792 gpu->PrintfDeviceVector(dIevc[eigenIndex], kPaddedStateCount * kPaddedStateCount, r);
795 #ifdef BEAGLE_DEBUG_FLOW
796 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setEigenDecomposition\n");
799 return BEAGLE_SUCCESS;
803 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setStateFrequencies(
int stateFrequenciesIndex,
804 const double* inStateFrequencies) {
805 #ifdef BEAGLE_DEBUG_FLOW
806 fprintf(stderr,
"\tEntering BeagleGPUImpl::setStateFrequencies\n");
809 if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
810 return BEAGLE_ERROR_OUT_OF_RANGE;
817 beagleMemCpy(hFrequenciesCache, inStateFrequencies, kStateCount);
819 gpu->MemcpyHostToDevice(dFrequencies[stateFrequenciesIndex], hFrequenciesCache,
820 sizeof(Real) * kPaddedStateCount);
822 #ifdef BEAGLE_DEBUG_FLOW
823 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setStateFrequencies\n");
826 return BEAGLE_SUCCESS;
830 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryWeights(
int categoryWeightsIndex,
831 const double* inCategoryWeights) {
832 #ifdef BEAGLE_DEBUG_FLOW
833 fprintf(stderr,
"\tEntering BeagleGPUImpl::setCategoryWeights\n");
836 if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
837 return BEAGLE_ERROR_OUT_OF_RANGE;
845 const Real* tmpWeights = beagleCastIfNecessary(inCategoryWeights, hWeightsCache,
848 gpu->MemcpyHostToDevice(dWeights[categoryWeightsIndex], tmpWeights,
849 sizeof(Real) * kCategoryCount);
851 #ifdef BEAGLE_DEBUG_FLOW
852 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setCategoryWeights\n");
855 return BEAGLE_SUCCESS;
859 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryRates(
const double* inCategoryRates) {
861 #ifdef BEAGLE_DEBUG_FLOW
862 fprintf(stderr,
"\tEntering BeagleGPUImpl::updateCategoryRates\n");
865 const double* categoryRates = inCategoryRates;
868 memcpy(hCategoryRates, categoryRates,
sizeof(
double) * kCategoryCount);
870 #ifdef BEAGLE_DEBUG_FLOW
871 fprintf(stderr,
"\tLeaving BeagleGPUImpl::updateCategoryRates\n");
874 return BEAGLE_SUCCESS;
878 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPatternWeights(
const double* inPatternWeights) {
880 #ifdef BEAGLE_DEBUG_FLOW
881 fprintf(stderr,
"\tEntering BeagleGPUImpl::setPatternWeights\n");
890 const Real* tmpWeights = beagleCastIfNecessary(inPatternWeights, hPatternWeightsCache, kPatternCount);
892 gpu->MemcpyHostToDevice(dPatternWeights, tmpWeights,
893 sizeof(Real) * kPatternCount);
895 #ifdef BEAGLE_DEBUG_FLOW
896 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setPatternWeights\n");
899 return BEAGLE_SUCCESS;
903 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getTransitionMatrix(
int matrixIndex,
905 #ifdef BEAGLE_DEBUG_FLOW
906 fprintf(stderr,
"\tEntering BeagleGPUImpl::getTransitionMatrix\n");
909 gpu->MemcpyDeviceToHost(hMatrixCache, dMatrices[matrixIndex],
sizeof(Real) * kMatrixSize * kCategoryCount);
911 double* outMatrixOffset = outMatrix;
912 Real* tmpRealMatrixOffset = hMatrixCache;
914 for (
int l = 0; l < kCategoryCount; l++) {
916 transposeSquareMatrix(tmpRealMatrixOffset, kPaddedStateCount);
918 for (
int i = 0; i < kStateCount; i++) {
924 beagleMemCpy(outMatrixOffset, tmpRealMatrixOffset, kStateCount);
925 tmpRealMatrixOffset += kPaddedStateCount;
926 outMatrixOffset += kStateCount;
928 tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
931 #ifdef BEAGLE_DEBUG_FLOW
932 fprintf(stderr,
"\tLeaving BeagleGPUImpl::getTransitionMatrix\n");
935 return BEAGLE_SUCCESS;
939 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrix(
int matrixIndex,
940 const double* inMatrix,
941 double paddedValue) {
943 #ifdef BEAGLE_DEBUG_FLOW
944 fprintf(stderr,
"\tEntering BeagleGPUImpl::setTransitionMatrix\n");
947 const double* inMatrixOffset = inMatrix;
948 Real* tmpRealMatrixOffset = hMatrixCache;
950 for (
int l = 0; l < kCategoryCount; l++) {
951 Real* transposeOffset = tmpRealMatrixOffset;
953 for (
int i = 0; i < kStateCount; i++) {
959 beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
960 tmpRealMatrixOffset += kPaddedStateCount;
961 inMatrixOffset += kStateCount;
964 transposeSquareMatrix(transposeOffset, kPaddedStateCount);
965 tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
969 gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
970 sizeof(Real) * kMatrixSize * kCategoryCount);
972 #ifdef BEAGLE_DEBUG_FLOW
973 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setTransitionMatrix\n");
976 return BEAGLE_SUCCESS;
980 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrices(
const int* matrixIndices,
981 const double* inMatrices,
982 const double* paddedValues,
985 #ifdef BEAGLE_DEBUG_FLOW
986 fprintf(stderr,
"\tEntering BeagleGPUImpl::setTransitionMatrices\n");
991 const double* inMatrixOffset = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
992 Real* tmpRealMatrixOffset = hMatrixCache;
993 int lumpedMatricesCount = 0;
994 int matrixIndex = matrixIndices[k];
997 for (
int l = 0; l < kCategoryCount; l++) {
998 Real* transposeOffset = tmpRealMatrixOffset;
1000 for (
int i = 0; i < kStateCount; i++) {
1006 beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
1007 tmpRealMatrixOffset += kPaddedStateCount;
1008 inMatrixOffset += kStateCount;
1011 transposeSquareMatrix(transposeOffset, kPaddedStateCount);
1012 tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
1015 lumpedMatricesCount++;
1017 }
while ((k < count) && (matrixIndices[k] == matrixIndices[k-1] + 1) && (lumpedMatricesCount < BEAGLE_CACHED_MATRICES_COUNT));
1020 gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
1021 sizeof(Real) * kMatrixSize * kCategoryCount * lumpedMatricesCount);
1025 #ifdef BEAGLE_DEBUG_FLOW
1026 fprintf(stderr,
"\tLeaving BeagleGPUImpl::setTransitionMatrices\n");
1029 return BEAGLE_SUCCESS;
1033 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updateTransitionMatrices(
int eigenIndex,
1034 const int* probabilityIndices,
1035 const int* firstDerivativeIndices,
1036 const int* secondDerivativeIndices,
1037 const double* edgeLengths,
1039 #ifdef BEAGLE_DEBUG_FLOW
1040 fprintf(stderr,
"\tEntering BeagleGPUImpl::updateTransitionMatrices\n");
1046 int indexOffset = kMatrixSize * kCategoryCount;
1047 int categoryOffset = kMatrixSize;
1051 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
1052 for (
int i = 0; i < count; i++) {
1053 for (
int j = 0; j < kCategoryCount; j++) {
1054 hPtrQueue[totalCount] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1055 hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
1060 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * totalCount);
1061 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue,
sizeof(Real) * totalCount);
1064 kernels->GetTransitionProbabilitiesSquare(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1065 dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1066 }
else if (secondDerivativeIndices == NULL) {
1068 totalCount = count * kCategoryCount;
1070 for (
int i = 0; i < count; i++) {
1071 for (
int j = 0; j < kCategoryCount; j++) {
1072 hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1073 hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
1074 hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
1075 hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
1080 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * totalCount * 2);
1081 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue,
sizeof(Real) * totalCount * 2);
1083 kernels->GetTransitionProbabilitiesSquareFirstDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1084 dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1087 totalCount = count * kCategoryCount;
1089 for (
int i = 0; i < count; i++) {
1090 for (
int j = 0; j < kCategoryCount; j++) {
1091 hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1092 hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
1093 hPtrQueue[ptrIndex + totalCount*2] = secondDerivativeIndices[i] * indexOffset + j * categoryOffset;
1094 hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
1095 hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
1100 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * totalCount * 3);
1101 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue,
sizeof(Real) * totalCount * 2);
1103 kernels->GetTransitionProbabilitiesSquareSecondDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1104 dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1111 for (
int i = 0; i < count; i++) {
1112 for (
int j = 0; j < kCategoryCount; j++) {
1113 hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
1118 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue,
sizeof(Real) * totalCount);
1121 for (
int i = 0; i < count; i++) {
1122 kernels->GetTransitionProbabilitiesSquare(dMatrices[probabilityIndices[i]], dEvec[eigenIndex], dIevc[eigenIndex],
1123 dEigenValues[eigenIndex], dDistanceQueue, kCategoryCount,
1124 i * kCategoryCount);
1129 #ifdef BEAGLE_DEBUG_VALUES
1131 for (
int i = 0; i < 1; i++) {
1132 fprintf(stderr,
"dMatrices[probabilityIndices[%d]] (hDQ = %1.5e, eL = %1.5e) =\n", i,hDistanceQueue[i], edgeLengths[i]);
1133 gpu->PrintfDeviceVector(dMatrices[probabilityIndices[i]], kMatrixSize * kCategoryCount, r);
1134 for(
int j=0; j<kCategoryCount; j++)
1135 fprintf(stderr,
" %1.5f",hCategoryRates[j]);
1136 fprintf(stderr,
"\n");
1140 #ifdef BEAGLE_DEBUG_SYNCH
1144 #ifdef BEAGLE_DEBUG_FLOW
1145 fprintf(stderr,
"\tLeaving BeagleGPUImpl::updateTransitionMatrices\n");
1148 return BEAGLE_SUCCESS;
1152 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updatePartials(
const int* operations,
1154 int cumulativeScalingIndex) {
1156 #ifdef BEAGLE_DEBUG_FLOW
1157 fprintf(stderr,
"\tEntering BeagleGPUImpl::updatePartials\n");
1160 GPUPtr cumulativeScalingBuffer = 0;
1161 if (cumulativeScalingIndex != BEAGLE_OP_NONE)
1162 cumulativeScalingBuffer = dScalingFactors[cumulativeScalingIndex];
1165 for (
int op = 0; op < operationCount; op++) {
1166 const int parIndex = operations[op * 7];
1167 const int writeScalingIndex = operations[op * 7 + 1];
1168 const int readScalingIndex = operations[op * 7 + 2];
1169 const int child1Index = operations[op * 7 + 3];
1170 const int child1TransMatIndex = operations[op * 7 + 4];
1171 const int child2Index = operations[op * 7 + 5];
1172 const int child2TransMatIndex = operations[op * 7 + 6];
1174 GPUPtr matrices1 = dMatrices[child1TransMatIndex];
1175 GPUPtr matrices2 = dMatrices[child2TransMatIndex];
1177 GPUPtr partials1 = dPartials[child1Index];
1178 GPUPtr partials2 = dPartials[child2Index];
1180 GPUPtr partials3 = dPartials[parIndex];
1182 GPUPtr tipStates1 = dStates[child1Index];
1183 GPUPtr tipStates2 = dStates[child2Index];
1185 int rescale = BEAGLE_OP_NONE;
1186 GPUPtr scalingFactors = (GPUPtr)NULL;
1188 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1189 int sIndex = parIndex - kTipCount;
1191 if (tipStates1 == 0 && tipStates2 == 0) {
1193 scalingFactors = dScalingFactors[sIndex];
1195 }
else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1197 scalingFactors = dScalingFactors[parIndex - kTipCount];
1198 }
else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && writeScalingIndex >= 0) {
1200 scalingFactors = dScalingFactors[writeScalingIndex];
1201 }
else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && readScalingIndex >= 0) {
1203 scalingFactors = dScalingFactors[readScalingIndex];
1206 #ifdef BEAGLE_DEBUG_VALUES
1207 fprintf(stderr,
"kPaddedPatternCount = %d\n", kPaddedPatternCount);
1208 fprintf(stderr,
"kPatternCount = %d\n", kPatternCount);
1209 fprintf(stderr,
"categoryCount = %d\n", kCategoryCount);
1210 fprintf(stderr,
"partialSize = %d\n", kPartialsSize);
1211 fprintf(stderr,
"writeIndex = %d, readIndex = %d, rescale = %d\n",writeScalingIndex,readScalingIndex,rescale);
1212 fprintf(stderr,
"child1 = \n");
1215 gpu->PrintfDeviceInt(tipStates1, kPaddedPatternCount);
1217 gpu->PrintfDeviceVector(partials1, kPartialsSize, r);
1218 fprintf(stderr,
"child2 = \n");
1220 gpu->PrintfDeviceInt(tipStates2, kPaddedPatternCount);
1222 gpu->PrintfDeviceVector(partials2, kPartialsSize, r);
1223 fprintf(stderr,
"node index = %d\n", parIndex);
1226 if (tipStates1 != 0) {
1227 if (tipStates2 != 0 ) {
1228 kernels->StatesStatesPruningDynamicScaling(tipStates1, tipStates2, partials3,
1229 matrices1, matrices2, scalingFactors,
1230 cumulativeScalingBuffer,
1231 kPaddedPatternCount, kCategoryCount,
1234 kernels->StatesPartialsPruningDynamicScaling(tipStates1, partials2, partials3,
1235 matrices1, matrices2, scalingFactors,
1236 cumulativeScalingBuffer,
1237 kPaddedPatternCount, kCategoryCount,
1241 if (tipStates2 != 0) {
1242 kernels->StatesPartialsPruningDynamicScaling(tipStates2, partials1, partials3,
1243 matrices2, matrices1, scalingFactors,
1244 cumulativeScalingBuffer,
1245 kPaddedPatternCount, kCategoryCount,
1248 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1249 kernels->PartialsPartialsPruningDynamicCheckScaling(partials1, partials2, partials3,
1250 matrices1, matrices2, writeScalingIndex, readScalingIndex,
1251 cumulativeScalingIndex, dScalingFactors, dScalingFactorsMaster,
1252 kPaddedPatternCount, kCategoryCount,
1253 rescale, hRescalingTrigger, dRescalingTrigger,
sizeof(Real));
1255 kernels->PartialsPartialsPruningDynamicScaling(partials1, partials2, partials3,
1256 matrices1, matrices2, scalingFactors,
1257 cumulativeScalingBuffer,
1258 kPaddedPatternCount, kCategoryCount,
1264 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1265 int parScalingIndex = parIndex - kTipCount;
1266 int child1ScalingIndex = child1Index - kTipCount;
1267 int child2ScalingIndex = child2Index - kTipCount;
1268 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
1269 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
1270 accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
1271 }
else if (child1ScalingIndex >= 0) {
1272 int scalingIndices[1] = {child1ScalingIndex};
1273 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
1274 }
else if (child2ScalingIndex >= 0) {
1275 int scalingIndices[1] = {child2ScalingIndex};
1276 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
1280 #ifdef BEAGLE_DEBUG_VALUES
1282 fprintf(stderr,
"scalars = ");
1283 gpu->PrintfDeviceVector(scalingFactors,kPaddedPatternCount, r);
1285 fprintf(stderr,
"parent = \n");
1287 if (writeScalingIndex == -1)
1288 gpu->PrintfDeviceVector(partials3, kPartialsSize, r);
1290 gpu->PrintfDeviceVector(partials3, kPartialsSize, 1.0, &signal, r);
1294 #ifdef BEAGLE_DEBUG_SYNCH
1298 #ifdef BEAGLE_DEBUG_FLOW
1299 fprintf(stderr,
"\tLeaving BeagleGPUImpl::updatePartials\n");
1302 return BEAGLE_SUCCESS;
1306 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::waitForPartials(
const int* ,
1308 #ifdef BEAGLE_DEBUG_FLOW
1309 fprintf(stderr,
"\tEntering BeagleGPUImpl::waitForPartials\n");
1312 #ifdef BEAGLE_DEBUG_FLOW
1313 fprintf(stderr,
"\tLeaving BeagleGPUImpl::waitForPartials\n");
1316 return BEAGLE_SUCCESS;
1320 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::accumulateScaleFactors(
const int* scalingIndices,
1322 int cumulativeScalingIndex) {
1323 #ifdef BEAGLE_DEBUG_FLOW
1324 fprintf(stderr,
"\tEntering BeagleGPUImpl::accumulateScaleFactors\n");
1327 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1328 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1329 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1331 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1335 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1337 for(
int n = 0; n < count; n++) {
1338 int sIndex = scalingIndices[n] - kTipCount;
1339 hPtrQueue[n] = sIndex;
1342 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1344 kernels->AccumulateFactorsAutoScaling(dScalingFactors[0], dPtrQueue, dAccumulatedScalingFactors, count, kPaddedPatternCount, kScaleBufferSize);
1347 for(
int n = 0; n < count; n++)
1348 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1349 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1354 kernels->AccumulateFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], count, kPaddedPatternCount);
1356 for (
int i = 0; i < count; i++) {
1357 kernels->AccumulateFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1358 1, kPaddedPatternCount);
1363 #ifdef BEAGLE_DEBUG_SYNCH
1367 #ifdef BEAGLE_DEBUG_VALUES
1369 fprintf(stderr,
"scaling factors = ");
1370 gpu->PrintfDeviceVector(dScalingFactors[cumulativeScalingIndex], kPaddedPatternCount, r);
1373 #ifdef BEAGLE_DEBUG_FLOW
1374 fprintf(stderr,
"\tLeaving BeagleGPUImpl::accumulateScaleFactors\n");
1377 return BEAGLE_SUCCESS;
1381 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::removeScaleFactors(
const int* scalingIndices,
1383 int cumulativeScalingIndex) {
1385 #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue,
sizeof(
unsigned int) * count);
1403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404 count, kPaddedPatternCount);
1406 for (
int i = 0; i < count; i++) {
1407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408 1, kPaddedPatternCount);
1413 #ifdef BEAGLE_DEBUG_SYNCH
1417 #ifdef BEAGLE_DEBUG_FLOW
1418 fprintf(stderr,
"\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1421 return BEAGLE_SUCCESS;
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1427 #ifdef BEAGLE_DEBUG_FLOW
1428 fprintf(stderr,
"\tEntering BeagleGPUImpl::resetScaleFactors\n");
1431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1435 if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize *
sizeof(Real));
1437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1441 Real* zeroes = (Real*) gpu->CallocHost(
sizeof(Real), kPaddedPatternCount);
1444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445 sizeof(Real) * kPaddedPatternCount);
1447 gpu->FreeHostMemory(zeroes);
1449 #ifdef BEAGLE_DEBUG_SYNCH
1453 #ifdef BEAGLE_DEBUG_FLOW
1454 fprintf(stderr,
"\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1457 return BEAGLE_SUCCESS;
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1462 int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464 fprintf(stderr,
"\tEntering BeagleGPUImpl::copyScaleFactors\n");
1467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
14/span> #ifdef BEAGLE_DEBUG_FLOW
1386 fprintf(stderr,
"\tEntering BeagleGPUImpl::removeScaleFactors\n");
1389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex],
sizeof(Real)*kScaleBufferSize);
1393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1397 for(
int n = 0; n < count; n++)
1398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399 gpu->MemcpyHostToDevice(dPtrQueue,