HMSBEAGLE  1.0.0
BeagleCPUImpl.hpp
1 /*
2  * BeagleCPUImpl.cpp
3  * BEAGLE
4  *
5  * Copyright 2009 Phylogenetic Likelihood Working Group
6  *
7  * This file is part of BEAGLE.
8  *
9  * BEAGLE is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as
11  * published by the Free Software Foundation, either version 3 of
12  * the License, or (at your option) any later version.
13  *
14  * BEAGLE is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with BEAGLE. If not, see
21  * <http://www.gnu.org/licenses/>.
22  *
23  * @author Andrew Rambaut
24  * @author Marc Suchard
25  * @author Daniel Ayres
26  * @author Mark Holder
27  */
28 
30 // so that we can flag them. This would this would be helpful for
31 // implementing:
32 // 1. an error-checking version that double-checks (to the extent
33 // possible) that the client is using the API correctly. This would
34 // ideally be a conditional compilation variant (so that we do
35 // not normally incur runtime penalties, but can enable it to help
36 // find bugs).
37 // 2. a multithreading impl that checks dependencies before queuing
38 // partials.
39 
41 // would be trivial for this impl, and would be easier for clients that want
42 // to cache partial like calculations for a indeterminate number of trees.
44 // void waitForPartials(int* instance;
45 // int instanceCount;
46 // int* parentPartialIndex;
47 // int partialCount;
48 // );
49 // method that blocks until the partials are valid would be important for
50 // clients (such as GARLI) that deal with big trees by overwriting some temporaries.
52 // but MTH did want to record it for posterity). We could add following
53 // calls:
55 // BeagleReturnCodes swapEigens(int instance, int *firstInd, int *secondInd, int count);
56 // BeagleReturnCodes swapTransitionMatrices(int instance, int *firstInd, int *secondInd, int count);
57 // BeagleReturnCodes swapPartials(int instance, int *firstInd, int *secondInd, int count);
59 // They would be optional for the client but could improve efficiency if:
60 // 1. The impl is load balancing, AND
61 // 2. The client code, uses the calls to synchronize the indexing of temporaries
62 // between instances such that you can pass an instanceIndices list with
63 // multiple entries to updatePartials.
64 // These seem too nitty gritty and low-level, but they also make it easy to
65 // write a wrapper geared toward MCMC (during a move, cache the old data
66 // in an unused array, after a rejection swap back to the cached copy)
67 
68 #ifndef BEAGLE_CPU_IMPL_GENERAL_HPP
69 #define BEAGLE_CPU_IMPL_GENERAL_HPP
70 
71 #ifdef HAVE_CONFIG_H
72 #include "libhmsbeagle/config.h"
73 #endif
74 
75 #include <cstdio>
76 #include <cstdlib>
77 #include <iostream>
78 #include <cstring>
79 #include <cmath>
80 #include <cassert>
81 #include <vector>
82 #include <cfloat>
83 
84 #include "libhmsbeagle/beagle.h"
85 #include "libhmsbeagle/CPU/Precision.h"
86 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
87 #include "libhmsbeagle/CPU/EigenDecompositionCube.h"
88 #include "libhmsbeagle/CPU/EigenDecompositionSquare.h"
89 
90 namespace beagle {
91 namespace cpu {
92 
93 //#if defined (BEAGLE_IMPL_DEBUGGING_OUTPUT) && BEAGLE_IMPL_DEBUGGING_OUTPUT
94 //const bool DEBUGGING_OUTPUT = true;
95 //#else
96 //const bool DEBUGGING_OUTPUT = false;
97 //#endif
98 
99 BEAGLE_CPU_FACTORY_TEMPLATE
100 inline const char* getBeagleCPUName(){ return "CPU-Unknown"; };
101 
102 template<>
103 inline const char* getBeagleCPUName<double>(){ return "CPU-Double"; };
104 
105 template<>
106 inline const char* getBeagleCPUName<float>(){ return "CPU-Single"; };
107 
108 BEAGLE_CPU_FACTORY_TEMPLATE
109 inline const long getBeagleCPUFlags(){ return BEAGLE_FLAG_COMPUTATION_SYNCH; };
110 
111 template<>
112 inline const long getBeagleCPUFlags<double>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH |
113  BEAGLE_FLAG_THREADING_NONE |
114  BEAGLE_FLAG_PROCESSOR_CPU |
115  BEAGLE_FLAG_PRECISION_DOUBLE |
116  BEAGLE_FLAG_VECTOR_NONE; };
117 
118 template<>
119 inline const long getBeagleCPUFlags<float>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH |
120  BEAGLE_FLAG_THREADING_NONE |
121  BEAGLE_FLAG_PROCESSOR_CPU |
122  BEAGLE_FLAG_PRECISION_SINGLE |
123  BEAGLE_FLAG_VECTOR_NONE; };
124 
125 
126 
127 BEAGLE_CPU_TEMPLATE
128 BeagleCPUImpl<BEAGLE_CPU_GENERIC>::~BeagleCPUImpl() {
129  // free all that stuff...
130  // If you delete partials, make sure not to delete the last element
131  // which is TEMP_SCRATCH_PARTIAL twice.
132 
133  for(unsigned int i=0; i<kEigenDecompCount; i++) {
134  if (gCategoryWeights[i] != NULL)
135  free(gCategoryWeights[i]);
136  if (gStateFrequencies[i] != NULL)
137  free(gStateFrequencies[i]);
138  }
139 
140  for(unsigned int i=0; i<kMatrixCount; i++) {
141  if (gTransitionMatrices[i] != NULL)
142  free(gTransitionMatrices[i]);
143  }
144  free(gTransitionMatrices);
145 
146  for(unsigned int i=0; i<kBufferCount; i++) {
147  if (gPartials[i] != NULL)
148  free(gPartials[i]);
149  if (gTipStates[i] != NULL)
150  free(gTipStates[i]);
151  }
152  free(gPartials);
153  free(gTipStates);
154 
155  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
156  for(unsigned int i=0; i<kScaleBufferCount; i++) {
157  if (gAutoScaleBuffers[i] != NULL)
158  free(gAutoScaleBuffers[i]);
159  }
160  if (gAutoScaleBuffers)
161  free(gAutoScaleBuffers);
162  free(gActiveScalingFactors);
163  if (gScaleBuffers[0] != NULL)
164  free(gScaleBuffers[0]);
165  } else {
166  for(unsigned int i=0; i<kScaleBufferCount; i++) {
167  if (gScaleBuffers[i] != NULL)
168  free(gScaleBuffers[i]);
169  }
170  }
171 
172  if (gScaleBuffers)
173  free(gScaleBuffers);
174 
175  free(gCategoryRates);
176  free(gPatternWeights);
177 
178  free(integrationTmp);
179  free(firstDerivTmp);
180  free(secondDerivTmp);
181 
182  free(outLogLikelihoodsTmp);
183  free(outFirstDerivativesTmp);
184  free(outSecondDerivativesTmp);
185 
186  free(ones);
187  free(zeros);
188 
189  delete gEigenDecomposition;
190 }
191 
192 BEAGLE_CPU_TEMPLATE
193 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::createInstance(int tipCount,
194  int partialsBufferCount,
195  int compactBufferCount,
196  int stateCount,
197  int patternCount,
198  int eigenDecompositionCount,
199  int matrixCount,
200  int categoryCount,
201  int scaleBufferCount,
202  int resourceNumber,
203  long preferenceFlags,
204  long requirementFlags) {
205  if (DEBUGGING_OUTPUT)
206  std::cerr << "in BeagleCPUImpl::initialize\n" ;
207 
208  if (DOUBLE_PRECISION) {
209  realtypeMin = DBL_MIN;
210  scalingExponentThreshhold = 200;
211  } else {
212  realtypeMin = FLT_MIN;
213  scalingExponentThreshhold = 20;
214  }
215 
216  kBufferCount = partialsBufferCount + compactBufferCount;
217  kTipCount = tipCount;
218  assert(kBufferCount > kTipCount);
219  kStateCount = stateCount;
220  kPatternCount = patternCount;
221 
222  kInternalPartialsBufferCount = kBufferCount - kTipCount;
223 
224  kTransPaddedStateCount = kStateCount + T_PAD;
225  kPartialsPaddedStateCount = kStateCount + P_PAD;
226 
227  // Handle possible padding of pattern sites for vectorization
228  int modulus = getPaddedPatternsModulus();
229  kPaddedPatternCount = kPatternCount;
230  int remainder = kPatternCount % modulus;
231  if (remainder != 0) {
232  kPaddedPatternCount += modulus - remainder;
233  }
234  kExtraPatterns = kPaddedPatternCount - kPatternCount;
235 
236  kMatrixCount = matrixCount;
237  kEigenDecompCount = eigenDecompositionCount;
238  kCategoryCount = categoryCount;
239  kScaleBufferCount = scaleBufferCount;
240 
241  kMatrixSize = (T_PAD + kStateCount) * kStateCount;
242 
243  int scaleBufferSize = kPaddedPatternCount;
244 
245  kFlags = 0;
246 
247  if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
248  kFlags |= BEAGLE_FLAG_SCALING_AUTO;
249  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
250  kScaleBufferCount = kInternalPartialsBufferCount;
251  } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
252  kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
253  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
254  kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for temp buffer used by edgelikelihood
255  } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
256  kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
257  kFlags |= BEAGLE_FLAG_SCALERS_RAW;
258  } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
259  kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
260  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
261  } else {
262  kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
263  kFlags |= BEAGLE_FLAG_SCALERS_RAW;
264  }
265 
266  if (requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX || preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
267  kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
268  else
269  kFlags |= BEAGLE_FLAG_EIGEN_REAL;
270 
271  if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
272  kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
273  else
274  kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;
275 
276  if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
277  gEigenDecomposition = new EigenDecompositionSquare<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
278  kStateCount,kCategoryCount,kFlags);
279  else
280  gEigenDecomposition = new EigenDecompositionCube<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
281  kStateCount, kCategoryCount,kFlags);
282 
283  gCategoryRates = (double*) malloc(sizeof(double) * kCategoryCount);
284  if (gCategoryRates == NULL)
285  throw std::bad_alloc();
286 
287  gPatternWeights = (double*) malloc(sizeof(double) * kPatternCount);
288  if (gPatternWeights == NULL)
289  throw std::bad_alloc();
290 
291  // TODO: if pattern padding is implemented this will create problems with setTipPartials
292  kPartialsSize = kPaddedPatternCount * kPartialsPaddedStateCount * kCategoryCount;
293 
294  gPartials = (REALTYPE**) malloc(sizeof(REALTYPE*) * kBufferCount);
295  if (gPartials == NULL)
296  throw std::bad_alloc();
297 
298  gStateFrequencies = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount);
299  if (gStateFrequencies == NULL)
300  throw std::bad_alloc();
301 
302  gCategoryWeights = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount);
303  if (gCategoryWeights == NULL)
304  throw std::bad_alloc();
305 
306  // assigning kBufferCount to this array so that we can just check if a tipStateBuffer is
307  // allocated
308  gTipStates = (int**) malloc(sizeof(int*) * kBufferCount);
309  if (gTipStates == NULL)
310  throw std::bad_alloc();
311 
312  for (int i = 0; i < kBufferCount; i++) {
313  gPartials[i] = NULL;
314  gTipStates[i] = NULL;
315  }
316 
317  for (int i = kTipCount; i < kBufferCount; i++) {
318  gPartials[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize);
319  if (gPartials[i] == NULL)
320  throw std::bad_alloc();
321  }
322 
323  gScaleBuffers = NULL;
324 
325  gAutoScaleBuffers = NULL;
326 
327  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
328  gAutoScaleBuffers = (signed short**) malloc(sizeof(signed short*) * kScaleBufferCount);
329  if (gAutoScaleBuffers == NULL)
330  throw std::bad_alloc();
331  for (int i = 0; i < kScaleBufferCount; i++) {
332  gAutoScaleBuffers[i] = (signed short*) malloc(sizeof(signed short) * scaleBufferSize);
333  if (gAutoScaleBuffers[i] == 0L)
334  throw std::bad_alloc();
335  }
336  gActiveScalingFactors = (int*) malloc(sizeof(int) * kInternalPartialsBufferCount);
337  gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*));
338  gScaleBuffers[0] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize);
339  } else {
340  gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*) * kScaleBufferCount);
341  if (gScaleBuffers == NULL)
342  throw std::bad_alloc();
343 
344  for (int i = 0; i < kScaleBufferCount; i++) {
345  gScaleBuffers[i] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize);
346 
347  if (gScaleBuffers[i] == 0L)
348  throw std::bad_alloc();
349 
350 
351  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
352  for (int j=0; j < scaleBufferSize; j++) {
353  gScaleBuffers[i][j] = 1.0;
354  }
355  }
356  }
357  }
358 
359 
360  gTransitionMatrices = (REALTYPE**) malloc(sizeof(REALTYPE*) * kMatrixCount);
361  if (gTransitionMatrices == NULL)
362  throw std::bad_alloc();
363  for (int i = 0; i < kMatrixCount; i++) {
364  gTransitionMatrices[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kMatrixSize * kCategoryCount);
365  if (gTransitionMatrices[i] == 0L)
366  throw std::bad_alloc();
367  }
368 
369  integrationTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPatternCount * kStateCount);
370  firstDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
371  secondDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
372 
373  outLogLikelihoodsTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
374  outFirstDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
375  outSecondDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
376 
377  zeros = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount);
378  ones = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount);
379  for(int i = 0; i < kPaddedPatternCount; i++) {
380  zeros[i] = 0.0;
381  ones[i] = 1.0;
382  }
383 
384  return BEAGLE_SUCCESS;
385 }
386 
387 BEAGLE_CPU_TEMPLATE
388 const char* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getName() {
389  return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
390 }
391 
392 BEAGLE_CPU_TEMPLATE
393 const long BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getFlags() {
394  return getBeagleCPUFlags<BEAGLE_CPU_FACTORY_GENERIC>();
395 }
396 
397 BEAGLE_CPU_TEMPLATE
398 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
399  if (returnInfo != NULL) {
400  returnInfo->resourceNumber = 0;
401  returnInfo->flags = getFlags();
402  returnInfo->flags |= kFlags;
403 
404  returnInfo->implName = (char*) getName();
405  }
406 
407  return BEAGLE_SUCCESS;
408 }
409 
410 BEAGLE_CPU_TEMPLATE
411 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipStates(int tipIndex,
412  const int* inStates) {
413  if (tipIndex < 0 || tipIndex >= kTipCount)
414  return BEAGLE_ERROR_OUT_OF_RANGE;
415  gTipStates[tipIndex] = (int*) mallocAligned(sizeof(int) * kPaddedPatternCount);
416  // TODO: What if this throws a memory full error?
417  for (int j = 0; j < kPatternCount; j++) {
418  gTipStates[tipIndex][j] = (inStates[j] < kStateCount ? inStates[j] : kStateCount);
419  }
420  for (int j = kPatternCount; j < kPaddedPatternCount; j++) {
421  gTipStates[tipIndex][j] = kStateCount;
422  }
423 
424  return BEAGLE_SUCCESS;
425 }
426 
427 BEAGLE_CPU_TEMPLATE
428 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipPartials(int tipIndex,
429  const double* inPartials) {
430  if (tipIndex < 0 || tipIndex >= kTipCount)
431  return BEAGLE_ERROR_OUT_OF_RANGE;
432  if(gPartials[tipIndex] == NULL) {
433  gPartials[tipIndex] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize);
434  // TODO: What if this throws a memory full error?
435  if (gPartials[tipIndex] == 0L)
436  return BEAGLE_ERROR_OUT_OF_MEMORY;
437  }
438 
439  const double* inPartialsOffset;
440  REALTYPE* tmpRealPartialsOffset = gPartials[tipIndex];
441  for (int l = 0; l < kCategoryCount; l++) {
442  inPartialsOffset = inPartials;
443  for (int i = 0; i < kPatternCount; i++) {
444  beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
445  tmpRealPartialsOffset += kPartialsPaddedStateCount;
446  inPartialsOffset += kStateCount;
447  }
448  // Pad extra buffer with zeros
449  for(int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
450  *tmpRealPartialsOffset++ = 0;
451  }
452  }
453 
454  return BEAGLE_SUCCESS;
455 }
456 
457 BEAGLE_CPU_TEMPLATE
458 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPartials(int bufferIndex,
459  const double* inPartials) {
460  if (bufferIndex < 0 || bufferIndex >= kBufferCount)
461  return BEAGLE_ERROR_OUT_OF_RANGE;
462  if (gPartials[bufferIndex] == NULL) {
463  gPartials[bufferIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kPartialsSize);
464  if (gPartials[bufferIndex] == 0L)
465  return BEAGLE_ERROR_OUT_OF_MEMORY;
466  }
467 
468  const double* inPartialsOffset = inPartials;
469  REALTYPE* tmpRealPartialsOffset = gPartials[bufferIndex];
470  for (int l = 0; l < kCategoryCount; l++) {
471  for (int i = 0; i < kPatternCount; i++) {
472  beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
473  tmpRealPartialsOffset += kPartialsPaddedStateCount;
474  inPartialsOffset += kStateCount;
475  }
476  // Pad extra buffer with zeros
477  for(int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
478  *tmpRealPartialsOffset++ = 0;
479  }
480  }
481 
482  return BEAGLE_SUCCESS;
483 }
484 
485 BEAGLE_CPU_TEMPLATE
486 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPartials(int bufferIndex,
487  int cumulativeScaleIndex,
488  double* outPartials) {
489  // TODO: Make this work with partials padding
490 
491  // TODO: Test with and without padding
492  if (bufferIndex < 0 || bufferIndex >= kBufferCount)
493  return BEAGLE_ERROR_OUT_OF_RANGE;
494 
495  if (kPatternCount == kPaddedPatternCount) {
496  beagleMemCpy(outPartials, gPartials[bufferIndex], kPartialsSize);
497  } else { // Need to remove padding
498  double *offsetOutPartials;
499  REALTYPE* offsetBeaglePartials = gPartials[bufferIndex];
500  for(int i = 0; i < kCategoryCount; i++) {
501  beagleMemCpy(offsetOutPartials,offsetBeaglePartials,
502  kPatternCount * kStateCount);
503  offsetOutPartials += kPatternCount * kStateCount;
504  offsetBeaglePartials += kPaddedPatternCount * kStateCount;
505  }
506  }
507 
508  if (cumulativeScaleIndex != BEAGLE_OP_NONE) {
509  REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
510  int index = 0;
511  for(int k=0; k<kPatternCount; k++) {
512  REALTYPE scaleFactor = exp(cumulativeScaleBuffer[k]);
513  for(int i=0; i<kStateCount; i++) {
514  outPartials[index] *= scaleFactor;
515  index++;
516  }
517  }
518  // TODO: Do we assume the cumulativeScaleBuffer is on the log-scale?
519  }
520 
521  return BEAGLE_SUCCESS;
522 }
523 
524 BEAGLE_CPU_TEMPLATE
525 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setEigenDecomposition(int eigenIndex,
526  const double* inEigenVectors,
527  const double* inInverseEigenVectors,
528  const double* inEigenValues) {
529 
530  gEigenDecomposition->setEigenDecomposition(eigenIndex, inEigenVectors, inInverseEigenVectors, inEigenValues);
531  return BEAGLE_SUCCESS;
532 }
533 
534 BEAGLE_CPU_TEMPLATE
535 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {
536  memcpy(gCategoryRates, inCategoryRates, sizeof(double) * kCategoryCount);
537  return BEAGLE_SUCCESS;
538 }
539 
540 BEAGLE_CPU_TEMPLATE
541 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {
542  assert(inPatternWeights != 0L);
543  memcpy(gPatternWeights, inPatternWeights, sizeof(double) * kPatternCount);
544  return BEAGLE_SUCCESS;
545 }
546 
547 BEAGLE_CPU_TEMPLATE
548  int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
549  const double* inStateFrequencies) {
550  if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
551  return BEAGLE_ERROR_OUT_OF_RANGE;
552  if (gStateFrequencies[stateFrequenciesIndex] == NULL) {
553  gStateFrequencies[stateFrequenciesIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kStateCount);
554  if (gStateFrequencies[stateFrequenciesIndex] == 0L)
555  return BEAGLE_ERROR_OUT_OF_MEMORY;
556  }
557  beagleMemCpy(gStateFrequencies[stateFrequenciesIndex], inStateFrequencies, kStateCount);
558 
559  return BEAGLE_SUCCESS;
560 }
561 
562 BEAGLE_CPU_TEMPLATE
563 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
564  const double* inCategoryWeights) {
565  if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
566  return BEAGLE_ERROR_OUT_OF_RANGE;
567  if (gCategoryWeights[categoryWeightsIndex] == NULL) {
568  gCategoryWeights[categoryWeightsIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kCategoryCount);
569  if (gCategoryWeights[categoryWeightsIndex] == 0L)
570  return BEAGLE_ERROR_OUT_OF_MEMORY;
571  }
572  beagleMemCpy(gCategoryWeights[categoryWeightsIndex], inCategoryWeights, kCategoryCount);
573 
574  return BEAGLE_SUCCESS;
575 }
576 
577 BEAGLE_CPU_TEMPLATE
578 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getTransitionMatrix(int matrixIndex,
579  double* outMatrix) {
580  // TODO Test with multiple rate categories
581 if (T_PAD != 0) {
582  double* offsetOutMatrix = outMatrix;
583  REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
584  for(int i = 0; i < kCategoryCount; i++) {
585  for(int j = 0; j < kStateCount; j++) {
586  beagleMemCpy(offsetOutMatrix,offsetBeagleMatrix,kStateCount);
587  offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
588  offsetOutMatrix += kStateCount;
589  }
590  }
591 } else {
592  beagleMemCpy(outMatrix,gTransitionMatrices[matrixIndex],
593  kMatrixSize * kCategoryCount);
594 }
595  return BEAGLE_SUCCESS;
596 }
597 
598 BEAGLE_CPU_TEMPLATE
599 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteLogLikelihoods(double* outLogLikelihoods) {
600  beagleMemCpy(outLogLikelihoods, outLogLikelihoodsTmp, kPatternCount);
601 
602  return BEAGLE_SUCCESS;
603 }
604 
605 BEAGLE_CPU_TEMPLATE
606 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteDerivatives(double* outFirstDerivatives,
607  double* outSecondDerivatives) {
608  beagleMemCpy(outFirstDerivatives, outFirstDerivativesTmp, kPatternCount);
609  if (outSecondDerivatives != NULL)
610  beagleMemCpy(outSecondDerivatives, outSecondDerivativesTmp, kPatternCount);
611 
612  return BEAGLE_SUCCESS;
613 }
614 
615 
616 BEAGLE_CPU_TEMPLATE
617 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrix(int matrixIndex,
618  const double* inMatrix,
619  double paddedValue) {
620 
621 if (T_PAD != 0) {
622  const double* offsetInMatrix = inMatrix;
623  REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
624  for(int i = 0; i < kCategoryCount; i++) {
625  for(int j = 0; j < kStateCount; j++) {
626  beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
627  offsetBeagleMatrix[kStateCount] = paddedValue;
628  offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
629  offsetInMatrix += kStateCount;
630  }
631  }
632 } else {
633  beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
634  kMatrixSize * kCategoryCount);
635 }
636  return BEAGLE_SUCCESS;
637 }
638 
639 BEAGLE_CPU_TEMPLATE
640 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
641  const double* inMatrices,
642  const double* paddedValues,
643  int count) {
644  for (int k = 0; k < count; k++) {
645  const double* inMatrix = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
646  int matrixIndex = matrixIndices[k];
647 
648 if (T_PAD != 0) {
649  const double* offsetInMatrix = inMatrix;
650  REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
651  for(int i = 0; i < kCategoryCount; i++) {
652  for(int j = 0; j < kStateCount; j++) {
653  beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
654  offsetBeagleMatrix[kStateCount] = paddedValues[k];
655  offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
656  offsetInMatrix += kStateCount;
657  }
658  }
659 } else {
660  beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
661  kMatrixSize * kCategoryCount);
662 }
663  }
664 
665  return BEAGLE_SUCCESS;
666 }
667 
668 BEAGLE_CPU_TEMPLATE
669 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
670  const int* probabilityIndices,
671  const int* firstDerivativeIndices,
672  const int* secondDerivativeIndices,
673  const double* edgeLengths,
674  int count) {
675  gEigenDecomposition->updateTransitionMatrices(eigenIndex,probabilityIndices,firstDerivativeIndices,secondDerivativeIndices,
676  edgeLengths,gCategoryRates,gTransitionMatrices,count);
677  return BEAGLE_SUCCESS;
678 }
679 
680 BEAGLE_CPU_TEMPLATE
681 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePartials(const int* operations,
682  int count,
683  int cumulativeScaleIndex) {
684 
685  REALTYPE* cumulativeScaleBuffer = NULL;
686  if (cumulativeScaleIndex != BEAGLE_OP_NONE)
687  cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
688 
689  for (int op = 0; op < count; op++) {
690  if (DEBUGGING_OUTPUT) {
691  std::cerr << "op[0]= " << operations[0] << "\n";
692  std::cerr << "op[1]= " << operations[1] << "\n";
693  std::cerr << "op[2]= " << operations[2] << "\n";
694  std::cerr << "op[3]= " << operations[3] << "\n";
695  std::cerr << "op[4]= " << operations[4] << "\n";
696  std::cerr << "op[5]= " << operations[5] << "\n";
697  std::cerr << "op[6]= " << operations[6] << "\n";
698  }
699 
700  const int parIndex = operations[op * 7];
701  const int writeScalingIndex = operations[op * 7 + 1];
702  const int readScalingIndex = operations[op * 7 + 2];
703  const int child1Index = operations[op * 7 + 3];
704  const int child1TransMatIndex = operations[op * 7 + 4];
705  const int child2Index = operations[op * 7 + 5];
706  const int child2TransMatIndex = operations[op * 7 + 6];
707 
708  const REALTYPE* partials1 = gPartials[child1Index];
709  const REALTYPE* partials2 = gPartials[child2Index];
710 
711  const int* tipStates1 = gTipStates[child1Index];
712  const int* tipStates2 = gTipStates[child2Index];
713 
714  const REALTYPE* matrices1 = gTransitionMatrices[child1TransMatIndex];
715  const REALTYPE* matrices2 = gTransitionMatrices[child2TransMatIndex];
716 
717  REALTYPE* destPartials = gPartials[parIndex];
718 
719  int rescale = BEAGLE_OP_NONE;
720  REALTYPE* scalingFactors = NULL;
721 
722  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
723  gActiveScalingFactors[parIndex - kTipCount] = 0;
724  if (tipStates1 == 0 && tipStates2 == 0)
725  rescale = 2;
726  } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
727  rescale = 1;
728  scalingFactors = gScaleBuffers[parIndex - kTipCount];
729  } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { // TODO: this is a quick and dirty implementation just so it returns correct results
730  if (tipStates1 == 0 && tipStates2 == 0) {
731  rescale = 1;
732  removeScaleFactors(&readScalingIndex, 1, cumulativeScaleIndex);
733  scalingFactors = gScaleBuffers[writeScalingIndex];
734  }
735  } else if (writeScalingIndex >= 0) {
736  rescale = 1;
737  scalingFactors = gScaleBuffers[writeScalingIndex];
738  } else if (readScalingIndex >= 0) {
739  rescale = 0;
740  scalingFactors = gScaleBuffers[readScalingIndex];
741  }
742 
743  if (DEBUGGING_OUTPUT) {
744  std::cerr << "Rescale= " << rescale << " writeIndex= " << writeScalingIndex
745  << " readIndex = " << readScalingIndex << "\n";
746  }
747 
748  if (tipStates1 != NULL) {
749  if (tipStates2 != NULL ) {
750  if (rescale == 0) { // Use fixed scaleFactors
751  calcStatesStatesFixedScaling(destPartials, tipStates1, matrices1, tipStates2, matrices2,
752  scalingFactors);
753  } else {
754  // First compute without any scaling
755  calcStatesStates(destPartials, tipStates1, matrices1, tipStates2, matrices2);
756  if (rescale == 1) // Recompute scaleFactors
757  rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
758  }
759  } else {
760  if (rescale == 0) {
761  calcStatesPartialsFixedScaling(destPartials, tipStates1, matrices1, partials2, matrices2,
762  scalingFactors);
763  } else {
764  calcStatesPartials(destPartials, tipStates1, matrices1, partials2, matrices2);
765  if (rescale == 1)
766  rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
767  }
768  }
769  } else {
770  if (tipStates2 != NULL) {
771  if (rescale == 0) {
772  calcStatesPartialsFixedScaling(destPartials,tipStates2,matrices2,partials1,matrices1,
773  scalingFactors);
774  } else {
775  calcStatesPartials(destPartials, tipStates2, matrices2, partials1, matrices1);
776  if (rescale == 1)
777  rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
778  }
779  } else {
780  if (rescale == 2) {
781  int sIndex = parIndex - kTipCount;
782  calcPartialsPartialsAutoScaling(destPartials,partials1,matrices1,partials2,matrices2,
783  &gActiveScalingFactors[sIndex]);
784  if (gActiveScalingFactors[sIndex])
785  autoRescalePartials(destPartials, gAutoScaleBuffers[sIndex]);
786 
787  } else if (rescale == 0) {
788  calcPartialsPartialsFixedScaling(destPartials,partials1,matrices1,partials2,matrices2,
789  scalingFactors);
790  } else {
791  calcPartialsPartials(destPartials, partials1, matrices1, partials2, matrices2);
792  if (rescale == 1)
793  rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
794  }
795  }
796  }
797 
798  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
799  int parScalingIndex = parIndex - kTipCount;
800  int child1ScalingIndex = child1Index - kTipCount;
801  int child2ScalingIndex = child2Index - kTipCount;
802  if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
803  int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
804  accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
805  } else if (child1ScalingIndex >= 0) {
806  int scalingIndices[1] = {child1ScalingIndex};
807  accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
808  } else if (child2ScalingIndex >= 0) {
809  int scalingIndices[1] = {child2ScalingIndex};
810  accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
811  }
812  }
813 
814  if (DEBUGGING_OUTPUT) {
815  if (scalingFactors != NULL && rescale == 0) {
816  for(int i=0; i<kPatternCount; i++)
817  fprintf(stderr,"old scaleFactor[%d] = %.5f\n",i,scalingFactors[i]);
818  }
819  fprintf(stderr,"Result partials:\n");
820  for(int i = 0; i < kPartialsSize; i++)
821  fprintf(stderr,"destP[%d] = %.5f\n",i,destPartials[i]);
822  }
823  }
824 
825  return BEAGLE_SUCCESS;
826 }
827 
828 
829 BEAGLE_CPU_TEMPLATE
830 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::waitForPartials(const int* destinationPartials,
831  int destinationPartialsCount) {
832  return BEAGLE_SUCCESS;
833 }
834 
835 
836 BEAGLE_CPU_TEMPLATE
837  int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices,
838  const int* categoryWeightsIndices,
839  const int* stateFrequenciesIndices,
840  const int* cumulativeScaleIndices,
841  int count,
842  double* outSumLogLikelihood) {
843 
844  if (count == 1) {
845  // We treat this as a special case so that we don't have convoluted logic
846  // at the end of the loop over patterns
847  int cumulativeScalingFactorIndex;
848  if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
849  cumulativeScalingFactorIndex = 0;
850  else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
851  cumulativeScalingFactorIndex = bufferIndices[0] - kTipCount;
852  else
853  cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
854  return calcRootLogLikelihoods(bufferIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
855  cumulativeScalingFactorIndex, outSumLogLikelihood);
856  }
857  else
858  {
859  return calcRootLogLikelihoodsMulti(bufferIndices, categoryWeightsIndices, stateFrequenciesIndices,
860  cumulativeScaleIndices, count, outSumLogLikelihood);
861  }
862 }
863 
864 BEAGLE_CPU_TEMPLATE
865 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices,
866  const int* categoryWeightsIndices,
867  const int* stateFrequenciesIndices,
868  const int* scaleBufferIndices,
869  int count,
870  double* outSumLogLikelihood) {
871  // Here we do the 3 similar operations:
872  // 1. to set the lnL to the contribution of the first subset,
873  // 2. to add the lnL for other subsets up to the penultimate
874  // 3. to add the final subset and take the lnL
875  // This form of the calc would not work when count == 1 because
876  // we need operation 1 and 3 in the preceding list. This is not
877  // a problem, though as we deal with count == 1 in the previous
878  // branch.
879 
880  std::vector<int> indexMaxScale(kPatternCount);
881  std::vector<REALTYPE> maxScaleFactor(kPatternCount);
882 
883  int returnCode = BEAGLE_SUCCESS;
884 
885  for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
886  const int rootPartialIndex = bufferIndices[subsetIndex];
887  const REALTYPE* rootPartials = gPartials[rootPartialIndex];
888  const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
889  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
890  int u = 0;
891  int v = 0;
892  for (int k = 0; k < kPatternCount; k++) {
893  for (int i = 0; i < kStateCount; i++) {
894  integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
895  u++;
896  v++;
897  }
898  v += P_PAD;
899  }
900  for (int l = 1; l < kCategoryCount; l++) {
901  u = 0;
902  for (int k = 0; k < kPatternCount; k++) {
903  for (int i = 0; i < kStateCount; i++) {
904  integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
905  u++;
906  v++;
907  }
908  v += P_PAD;
909  }
910  }
911  u = 0;
912  for (int k = 0; k < kPatternCount; k++) {
913  REALTYPE sum = 0.0;
914  for (int i = 0; i < kStateCount; i++) {
915  sum += ((REALTYPE)frequencies[i]) * integrationTmp[u];
916  u++;
917  }
918 
919  // TODO: allow only some subsets to have scale indices
920  if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
921  int cumulativeScalingFactorIndex;
922  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
923  cumulativeScalingFactorIndex = rootPartialIndex - kTipCount;
924  else
925  cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex];
926 
927  const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
928 
929  if (subsetIndex == 0) {
930  indexMaxScale[k] = 0;
931  maxScaleFactor[k] = cumulativeScaleFactors[k];
932  for (int j = 1; j < count; j++) {
933  REALTYPE tmpScaleFactor;
934  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
935  tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k];
936  else
937  tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k];
938 
939  if (tmpScaleFactor > maxScaleFactor[k]) {
940  indexMaxScale[k] = j;
941  maxScaleFactor[k] = tmpScaleFactor;
942  }
943  }
944  }
945 
946  if (subsetIndex != indexMaxScale[k])
947  sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
948  }
949 
950  if (subsetIndex == 0) {
951  outLogLikelihoodsTmp[k] = sum;
952  } else if (subsetIndex == count - 1) {
953  REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum;
954 
955  outLogLikelihoodsTmp[k] = log(tmpSum);
956  } else {
957  outLogLikelihoodsTmp[k] += sum;
958  }
959  }
960  }
961 
962  if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
963  for(int i=0; i<kPatternCount; i++)
964  outLogLikelihoodsTmp[i] += maxScaleFactor[i];
965  }
966 
967  *outSumLogLikelihood = 0.0;
968  for (int i = 0; i < kPatternCount; i++) {
969  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
970  }
971 
972  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
973  returnCode = BEAGLE_ERROR_FLOATING_POINT;
974 
975  return returnCode;
976 
977 }
978 
979 BEAGLE_CPU_TEMPLATE
980 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex,
981  const int categoryWeightsIndex,
982  const int stateFrequenciesIndex,
983  const int scalingFactorsIndex,
984  double* outSumLogLikelihood) {
985 
986  int returnCode = BEAGLE_SUCCESS;
987 
988  const REALTYPE* rootPartials = gPartials[bufferIndex];
989  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
990  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
991  int u = 0;
992  int v = 0;
993  for (int k = 0; k < kPatternCount; k++) {
994  for (int i = 0; i < kStateCount; i++) {
995  integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
996  u++;
997  v++;
998  }
999  v += P_PAD;
1000  }
1001  for (int l = 1; l < kCategoryCount; l++) {
1002  u = 0;
1003  for (int k = 0; k < kPatternCount; k++) {
1004  for (int i = 0; i < kStateCount; i++) {
1005  integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
1006  u++;
1007  v++;
1008  }
1009  v += P_PAD;
1010  }
1011  }
1012  u = 0;
1013  for (int k = 0; k < kPatternCount; k++) {
1014  REALTYPE sum = 0.0;
1015  for (int i = 0; i < kStateCount; i++) {
1016  sum += freqs[i] * integrationTmp[u];
1017  u++;
1018  }
1019 
1020  outLogLikelihoodsTmp[k] = log(sum);
1021  }
1022 
1023  if (scalingFactorsIndex >= 0) {
1024  const REALTYPE* cumulativeScaleFactors = gScaleBuffers[scalingFactorsIndex];
1025  for(int i=0; i<kPatternCount; i++) {
1026  outLogLikelihoodsTmp[i] += cumulativeScaleFactors[i];
1027  }
1028  }
1029 
1030  *outSumLogLikelihood = 0.0;
1031  for (int i = 0; i < kPatternCount; i++) {
1032  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1033  }
1034 
1035  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1036  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1037 
1038 
1039  // TODO: merge the three kPatternCount loops above into one
1040 
1041  return returnCode;
1042 }
1043 
1044 BEAGLE_CPU_TEMPLATE
1045 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
1046  int count,
1047  int cumulativeScalingIndex) {
1048  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1049  REALTYPE* cumulativeScaleBuffer = gScaleBuffers[0];
1050  for(int j=0; j<kPatternCount; j++)
1051  cumulativeScaleBuffer[j] = 0;
1052  for(int i=0; i<count; i++) {
1053  int sIndex = scalingIndices[i] - kTipCount;
1054  if (gActiveScalingFactors[sIndex]) {
1055  const signed short* scaleBuffer = gAutoScaleBuffers[sIndex];
1056  for(int j=0; j<kPatternCount; j++) {
1057  cumulativeScaleBuffer[j] += M_LN2 * scaleBuffer[j];
1058  }
1059  }
1060  }
1061 
1062  } else {
1063  REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
1064  for(int i=0; i<count; i++) {
1065  const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
1066  for(int j=0; j<kPatternCount; j++) {
1067  if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
1068  cumulativeScaleBuffer[j] += scaleBuffer[j];
1069  else
1070  cumulativeScaleBuffer[j] += log(scaleBuffer[j]);
1071  }
1072  }
1073 
1074  if (DEBUGGING_OUTPUT) {
1075  fprintf(stderr,"Accumulating %d scale buffers into #%d\n",count,cumulativeScalingIndex);
1076  for(int j=0; j<kPatternCount; j++) {
1077  fprintf(stderr,"cumulativeScaleBuffer[%d] = %2.5e\n",j,cumulativeScaleBuffer[j]);
1078  }
1079  }
1080  }
1081 
1082  return BEAGLE_SUCCESS;
1083 }
1084 
1085 BEAGLE_CPU_TEMPLATE
1086 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
1087  int count,
1088  int cumulativeScalingIndex) {
1089  REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
1090  for(int i=0; i<count; i++) {
1091  const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
1092  for(int j=0; j<kPatternCount; j++) {
1093  if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
1094  cumulativeScaleBuffer[j] -= scaleBuffer[j];
1095  else
1096  cumulativeScaleBuffer[j] -= log(scaleBuffer[j]);
1097  }
1098  }
1099 
1100  return BEAGLE_SUCCESS;
1101 }
1102 
1103 BEAGLE_CPU_TEMPLATE
1104 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1105  //memcpy(gScaleBuffers[cumulativeScalingIndex],zeros,sizeof(double) * kPatternCount);
1106 
1107  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1108  memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(signed short) * kPaddedPatternCount);
1109  } else {
1110  memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(REALTYPE) * kPaddedPatternCount);
1111  }
1112  return BEAGLE_SUCCESS;
1113 }
1114 
1115 BEAGLE_CPU_TEMPLATE
1116 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1117  int srcScalingIndex) {
1118  memcpy(gScaleBuffers[destScalingIndex],gScaleBuffers[srcScalingIndex],sizeof(REALTYPE) * kPatternCount);
1119 
1120  return BEAGLE_SUCCESS;
1121 }
1122 
1123 BEAGLE_CPU_TEMPLATE
1124  int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices,
1125  const int* childBufferIndices,
1126  const int* probabilityIndices,
1127  const int* firstDerivativeIndices,
1128  const int* secondDerivativeIndices,
1129  const int* categoryWeightsIndices,
1130  const int* stateFrequenciesIndices,
1131  const int* cumulativeScaleIndices,
1132  int count,
1133  double* outSumLogLikelihood,
1134  double* outSumFirstDerivative,
1135  double* outSumSecondDerivative) {
1136  // TODO: implement for count > 1
1137 
1138  if (count == 1) {
1139  int cumulativeScalingFactorIndex;
1140  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1141  cumulativeScalingFactorIndex = 0;
1142  } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1143  cumulativeScalingFactorIndex = kInternalPartialsBufferCount;
1144  int child1ScalingIndex = parentBufferIndices[0] - kTipCount;
1145  int child2ScalingIndex = childBufferIndices[0] - kTipCount;
1146  resetScaleFactors(cumulativeScalingFactorIndex);
1147  if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
1148  int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
1149  accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactorIndex);
1150  } else if (child1ScalingIndex >= 0) {
1151  int scalingIndices[1] = {child1ScalingIndex};
1152  accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
1153  } else if (child2ScalingIndex >= 0) {
1154  int scalingIndices[1] = {child2ScalingIndex};
1155  accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
1156  }
1157  } else {
1158  cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
1159  }
1160  if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL)
1161  return calcEdgeLogLikelihoods(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
1162  categoryWeightsIndices[0], stateFrequenciesIndices[0], cumulativeScalingFactorIndex,
1163  outSumLogLikelihood);
1164  else if (secondDerivativeIndices == NULL)
1165  return calcEdgeLogLikelihoodsFirstDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
1166  firstDerivativeIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
1167  cumulativeScalingFactorIndex, outSumLogLikelihood, outSumFirstDerivative);
1168  else
1169  return calcEdgeLogLikelihoodsSecondDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
1170  firstDerivativeIndices[0], secondDerivativeIndices[0], categoryWeightsIndices[0],
1171  stateFrequenciesIndices[0], cumulativeScalingFactorIndex, outSumLogLikelihood,
1172  outSumFirstDerivative, outSumSecondDerivative);
1173  } else {
1174  if ((kFlags & BEAGLE_FLAG_SCALING_AUTO) || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
1175  fprintf(stderr,"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and auto/always scaling\n");
1176  }
1177 
1178  if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
1179  return calcEdgeLogLikelihoodsMulti(parentBufferIndices, childBufferIndices, probabilityIndices,
1180  categoryWeightsIndices, stateFrequenciesIndices, cumulativeScaleIndices, count,
1181  outSumLogLikelihood);
1182  } else {
1183  fprintf(stderr,"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
1184  }
1185 
1186 
1187  }
1188 
1189 }
1190 
1191 BEAGLE_CPU_TEMPLATE
1192 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex,
1193  const int childIndex,
1194  const int probIndex,
1195  const int categoryWeightsIndex,
1196  const int stateFrequenciesIndex,
1197  const int scalingFactorsIndex,
1198  double* outSumLogLikelihood) {
1199 
1200  assert(parIndex >= kTipCount);
1201 
1202  int returnCode = BEAGLE_SUCCESS;
1203 
1204  const REALTYPE* partialsParent = gPartials[parIndex];
1205  const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1206  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1207  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
1208 
1209  memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1210 
1211 
1212  if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
1213 
1214  const int* statesChild = gTipStates[childIndex];
1215  int v = 0; // Index for parent partials
1216 
1217  for(int l = 0; l < kCategoryCount; l++) {
1218  int u = 0; // Index in resulting product-partials (summed over categories)
1219  const REALTYPE weight = wt[l];
1220  for(int k = 0; k < kPatternCount; k++) {
1221 
1222  const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
1223  // so we can interchange the patterCount and categoryCount loop order?
1224  int w = l * kMatrixSize;
1225  for(int i = 0; i < kStateCount; i++) {
1226  integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1227  u++;
1228 
1229  w += kTransPaddedStateCount;
1230  }
1231  v += kPartialsPaddedStateCount;
1232  }
1233  }
1234 
1235  } else { // Integrate against a partial at the child
1236 
1237  const REALTYPE* partialsChild = gPartials[childIndex];
1238  int v = 0;
1239  int stateCountModFour = (kStateCount / 4) * 4;
1240 
1241  for(int l = 0; l < kCategoryCount; l++) {
1242  int u = 0;
1243  const REALTYPE weight = wt[l];
1244  for(int k = 0; k < kPatternCount; k++) {
1245  int w = l * kMatrixSize;
1246  const REALTYPE* partialsChildPtr = &partialsChild[v];
1247  for(int i = 0; i < kStateCount; i++) {
1248  double sumOverJA = 0.0, sumOverJB = 0.0;
1249  int j = 0;
1250  const REALTYPE* transMatrixPtr = &transMatrix[w];
1251  for (; j < stateCountModFour; j += 4) {
1252  sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
1253  sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
1254  sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
1255  sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
1256 
1257  }
1258  for (; j < kStateCount; j++) {
1259  sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
1260  }
1261  // for(int j = 0; j < kStateCount; j++) {
1262  // sumOverJ += transMatrix[w] * partialsChild[v + j];
1263  // w++;
1264  // }
1265  integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
1266  u++;
1267 
1268  w += kStateCount;
1269 
1270  // increment for the extra column at the end
1271  w += T_PAD;
1272  }
1273  v += kPartialsPaddedStateCount;
1274  }
1275  }
1276  }
1277 
1278  int u = 0;
1279  for(int k = 0; k < kPatternCount; k++) {
1280  REALTYPE sumOverI = 0.0;
1281  for(int i = 0; i < kStateCount; i++) {
1282  sumOverI += freqs[i] * integrationTmp[u];
1283  u++;
1284  }
1285 
1286  outLogLikelihoodsTmp[k] = log(sumOverI);
1287  }
1288 
1289 
1290  if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1291  const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1292  for(int k=0; k < kPatternCount; k++)
1293  outLogLikelihoodsTmp[k] += scalingFactors[k];
1294  }
1295 
1296  *outSumLogLikelihood = 0.0;
1297  for (int i = 0; i < kPatternCount; i++) {
1298  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1299  }
1300 
1301  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1302  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1303 
1304  return returnCode;
1305 }
1306 
1307 BEAGLE_CPU_TEMPLATE
1308 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsMulti(const int* parentBufferIndices,
1309  const int* childBufferIndices,
1310  const int* probabilityIndices,
1311  const int* categoryWeightsIndices,
1312  const int* stateFrequenciesIndices,
1313  const int* scalingFactorsIndices,
1314  int count,
1315  double* outSumLogLikelihood) {
1316 
1317  std::vector<int> indexMaxScale(kPatternCount);
1318  std::vector<REALTYPE> maxScaleFactor(kPatternCount);
1319 
1320  int returnCode = BEAGLE_SUCCESS;
1321 
1322  for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
1323  const REALTYPE* partialsParent = gPartials[parentBufferIndices[subsetIndex]];
1324  const REALTYPE* transMatrix = gTransitionMatrices[probabilityIndices[subsetIndex]];
1325  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
1326  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
1327  int childIndex = childBufferIndices[subsetIndex];
1328 
1329  memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1330 
1331  if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
1332 
1333  const int* statesChild = gTipStates[childIndex];
1334  int v = 0; // Index for parent partials
1335 
1336  for(int l = 0; l < kCategoryCount; l++) {
1337  int u = 0; // Index in resulting product-partials (summed over categories)
1338  const REALTYPE weight = wt[l];
1339  for(int k = 0; k < kPatternCount; k++) {
1340 
1341  const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
1342  // so we can interchange the patterCount and categoryCount loop order?
1343  int w = l * kMatrixSize;
1344  for(int i = 0; i < kStateCount; i++) {
1345  integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1346  u++;
1347 
1348  w += kTransPaddedStateCount;
1349  }
1350  v += kPartialsPaddedStateCount;
1351  }
1352  }
1353  } else {
1354  const REALTYPE* partialsChild = gPartials[childIndex];
1355  int v = 0;
1356  int stateCountModFour = (kStateCount / 4) * 4;
1357 
1358  for(int l = 0; l < kCategoryCount; l++) {
1359  int u = 0;
1360  const REALTYPE weight = wt[l];
1361  for(int k = 0; k < kPatternCount; k++) {
1362  int w = l * kMatrixSize;
1363  const REALTYPE* partialsChildPtr = &partialsChild[v];
1364  for(int i = 0; i < kStateCount; i++) {
1365  double sumOverJA = 0.0, sumOverJB = 0.0;
1366  int j = 0;
1367  const REALTYPE* transMatrixPtr = &transMatrix[w];
1368  for (; j < stateCountModFour; j += 4) {
1369  sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
1370  sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
1371  sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
1372  sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
1373 
1374  }
1375  for (; j < kStateCount; j++) {
1376  sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
1377  }
1378 // for(int j = 0; j < kStateCount; j++) {
1379 // sumOverJ += transMatrix[w] * partialsChild[v + j];
1380 // w++;
1381 // }
1382  integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
1383  u++;
1384 
1385  w += kStateCount;
1386 
1387  // increment for the extra column at the end
1388  w += T_PAD;
1389  }
1390  v += kPartialsPaddedStateCount;
1391  }
1392  }
1393 
1394  }
1395  int u = 0;
1396  for(int k = 0; k < kPatternCount; k++) {
1397  REALTYPE sumOverI = 0.0;
1398  for(int i = 0; i < kStateCount; i++) {
1399  sumOverI += freqs[i] * integrationTmp[u];
1400  u++;
1401  }
1402 
1403  if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
1404  int cumulativeScalingFactorIndex;
1405  cumulativeScalingFactorIndex = scalingFactorsIndices[subsetIndex];
1406 
1407  const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
1408 
1409  if (subsetIndex == 0) {
1410  indexMaxScale[k] = 0;
1411  maxScaleFactor[k] = cumulativeScaleFactors[k];
1412  for (int j = 1; j < count; j++) {
1413  REALTYPE tmpScaleFactor;
1414  tmpScaleFactor = gScaleBuffers[scalingFactorsIndices[j]][k];
1415 
1416  if (tmpScaleFactor > maxScaleFactor[k]) {
1417  indexMaxScale[k] = j;
1418  maxScaleFactor[k] = tmpScaleFactor;
1419  }
1420  }
1421  }
1422 
1423  if (subsetIndex != indexMaxScale[k])
1424  sumOverI *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
1425  }
1426 
1427 
1428 
1429  if (subsetIndex == 0) {
1430  outLogLikelihoodsTmp[k] = sumOverI;
1431  } else if (subsetIndex == count - 1) {
1432  REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sumOverI;
1433 
1434  outLogLikelihoodsTmp[k] = log(tmpSum);
1435  } else {
1436  outLogLikelihoodsTmp[k] += sumOverI;
1437  }
1438 
1439  }
1440 
1441  }
1442 
1443  if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
1444  for(int i=0; i<kPatternCount; i++)
1445  outLogLikelihoodsTmp[i] += maxScaleFactor[i];
1446  }
1447 
1448 
1449  *outSumLogLikelihood = 0.0;
1450  for (int i = 0; i < kPatternCount; i++) {
1451  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1452  }
1453 
1454  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1455  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1456 
1457  return returnCode;
1458 }
1459 
1460 
1461 BEAGLE_CPU_TEMPLATE
1462 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsFirstDeriv(const int parIndex,
1463  const int childIndex,
1464  const int probIndex,
1465  const int firstDerivativeIndex,
1466  const int categoryWeightsIndex,
1467  const int stateFrequenciesIndex,
1468  const int scalingFactorsIndex,
1469  double* outSumLogLikelihood,
1470  double* outSumFirstDerivative) {
1471 
1472  assert(parIndex >= kTipCount);
1473 
1474  int returnCode = BEAGLE_SUCCESS;
1475 
1476  const REALTYPE* partialsParent = gPartials[parIndex];
1477  const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1478  const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
1479  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1480  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
1481 
1482 
1483  memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1484  memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1485 
1486  if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
1487 
1488  const int* statesChild = gTipStates[childIndex];
1489  int v = 0; // Index for parent partials
1490 
1491  for(int l = 0; l < kCategoryCount; l++) {
1492  int u = 0; // Index in resulting product-partials (summed over categories)
1493  const REALTYPE weight = wt[l];
1494  for(int k = 0; k < kPatternCount; k++) {
1495 
1496  const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
1497  // so we can interchange the patterCount and categoryCount loop order?
1498  int w = l * kMatrixSize;
1499  for(int i = 0; i < kStateCount; i++) {
1500  integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1501  firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1502  u++;
1503 
1504  w += kTransPaddedStateCount;
1505  }
1506  v += kPartialsPaddedStateCount;
1507  }
1508  }
1509 
1510  } else { // Integrate against a partial at the child
1511 
1512  const REALTYPE* partialsChild = gPartials[childIndex];
1513  int v = 0;
1514 
1515  for(int l = 0; l < kCategoryCount; l++) {
1516  int u = 0;
1517  const REALTYPE weight = wt[l];
1518  for(int k = 0; k < kPatternCount; k++) {
1519  int w = l * kMatrixSize;
1520  for(int i = 0; i < kStateCount; i++) {
1521  double sumOverJ = 0.0;
1522  double sumOverJD1 = 0.0;
1523  for(int j = 0; j < kStateCount; j++) {
1524  sumOverJ += transMatrix[w] * partialsChild[v + j];
1525  sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
1526  w++;
1527  }
1528 
1529  // increment for the extra column at the end
1530  w += T_PAD;
1531 
1532  integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
1533  firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
1534  u++;
1535  }
1536  v += kPartialsPaddedStateCount;
1537  }
1538  }
1539  }
1540 
1541  int u = 0;
1542  for(int k = 0; k < kPatternCount; k++) {
1543  REALTYPE sumOverI = 0.0;
1544  REALTYPE sumOverID1 = 0.0;
1545  for(int i = 0; i < kStateCount; i++) {
1546  sumOverI += freqs[i] * integrationTmp[u];
1547  sumOverID1 += freqs[i] * firstDerivTmp[u];
1548  u++;
1549  }
1550 
1551  outLogLikelihoodsTmp[k] = log(sumOverI);
1552  outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
1553  }
1554 
1555 
1556  if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1557  const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1558  for(int k=0; k < kPatternCount; k++)
1559  outLogLikelihoodsTmp[k] += scalingFactors[k];
1560  }
1561 
1562  *outSumLogLikelihood = 0.0;
1563  *outSumFirstDerivative = 0.0;
1564  for (int i = 0; i < kPatternCount; i++) {
1565  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1566 
1567  *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
1568  }
1569 
1570  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1571  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1572 
1573  return returnCode;
1574 }
1575 
1576 BEAGLE_CPU_TEMPLATE
1577 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsSecondDeriv(const int parIndex,
1578  const int childIndex,
1579  const int probIndex,
1580  const int firstDerivativeIndex,
1581  const int secondDerivativeIndex,
1582  const int categoryWeightsIndex,
1583  const int stateFrequenciesIndex,
1584  const int scalingFactorsIndex,
1585  double* outSumLogLikelihood,
1586  double* outSumFirstDerivative,
1587  double* outSumSecondDerivative) {
1588 
1589  assert(parIndex >= kTipCount);
1590 
1591  int returnCode = BEAGLE_SUCCESS;
1592 
1593  const REALTYPE* partialsParent = gPartials[parIndex];
1594  const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1595  const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
1596  const REALTYPE* secondDerivMatrix = gTransitionMatrices[secondDerivativeIndex];
1597  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1598  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
1599 
1600 
1601  memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1602  memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1603  memset(secondDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1604 
1605  if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
1606 
1607  const int* statesChild = gTipStates[childIndex];
1608  int v = 0; // Index for parent partials
1609 
1610  for(int l = 0; l < kCategoryCount; l++) {
1611  int u = 0; // Index in resulting product-partials (summed over categories)
1612  const REALTYPE weight = wt[l];
1613  for(int k = 0; k < kPatternCount; k++) {
1614 
1615  const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
1616  // so we can interchange the patterCount and categoryCount loop order?
1617  int w = l * kMatrixSize;
1618  for(int i = 0; i < kStateCount; i++) {
1619  integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1620  firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1621  secondDerivTmp[u] += secondDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1622  u++;
1623 
1624  w += kTransPaddedStateCount;
1625  }
1626  v += kPartialsPaddedStateCount;
1627  }
1628  }
1629 
1630  } else { // Integrate against a partial at the child
1631 
1632  const REALTYPE* partialsChild = gPartials[childIndex];
1633  int v = 0;
1634 
1635  for(int l = 0; l < kCategoryCount; l++) {
1636  int u = 0;
1637  const REALTYPE weight = wt[l];
1638  for(int k = 0; k < kPatternCount; k++) {
1639  int w = l * kMatrixSize;
1640  for(int i = 0; i < kStateCount; i++) {
1641  double sumOverJ = 0.0;
1642  double sumOverJD1 = 0.0;
1643  double sumOverJD2 = 0.0;
1644  for(int j = 0; j < kStateCount; j++) {
1645  sumOverJ += transMatrix[w] * partialsChild[v + j];
1646  sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
1647  sumOverJD2 += secondDerivMatrix[w] * partialsChild[v + j];
1648  w++;
1649  }
1650 
1651  // increment for the extra column at the end
1652  w += T_PAD;
1653 
1654  integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
1655  firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
1656  secondDerivTmp[u] += sumOverJD2 * partialsParent[v + i] * weight;
1657  u++;
1658  }
1659  v += kPartialsPaddedStateCount;
1660  }
1661  }
1662  }
1663 
1664  int u = 0;
1665  for(int k = 0; k < kPatternCount; k++) {
1666  REALTYPE sumOverI = 0.0;
1667  REALTYPE sumOverID1 = 0.0;
1668  REALTYPE sumOverID2 = 0.0;
1669  for(int i = 0; i < kStateCount; i++) {
1670  sumOverI += freqs[i] * integrationTmp[u];
1671  sumOverID1 += freqs[i] * firstDerivTmp[u];
1672  sumOverID2 += freqs[i] * secondDerivTmp[u];
1673  u++;
1674  }
1675 
1676  outLogLikelihoodsTmp[k] = log(sumOverI);
1677  outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
1678  outSecondDerivativesTmp[k] = sumOverID2 / sumOverI - outFirstDerivativesTmp[k] * outFirstDerivativesTmp[k];
1679  }
1680 
1681 
1682  if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1683  const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1684  for(int k=0; k < kPatternCount; k++)
1685  outLogLikelihoodsTmp[k] += scalingFactors[k];
1686  }
1687 
1688  *outSumLogLikelihood = 0.0;
1689  *outSumFirstDerivative = 0.0;
1690  *outSumSecondDerivative = 0.0;
1691  for (int i = 0; i < kPatternCount; i++) {
1692  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1693 
1694  *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
1695 
1696  *outSumSecondDerivative += outSecondDerivativesTmp[i] * gPatternWeights[i];
1697  }
1698 
1699  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1700  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1701 
1702  return returnCode;
1703 }
1704 
1705 
1706 
1707 BEAGLE_CPU_TEMPLATE
1708 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::block(void) {
1709  // Do nothing.
1710  return BEAGLE_SUCCESS;
1711 }
1712 
1714 // private methods
1715 
1716 /*
1717  * Re-scales the partial likelihoods such that the largest is one.
1718  */
1719 BEAGLE_CPU_TEMPLATE
1720 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
1721  REALTYPE* scaleFactors,
1722  REALTYPE* cumulativeScaleFactors,
1723  const int fillWithOnes) {
1724  if (DEBUGGING_OUTPUT) {
1725  std::cerr << "destP (before rescale): \n";// << destP << "\n";
1726  for(int i=0; i<kPartialsSize; i++)
1727  fprintf(stderr,"destP[%d] = %.5f\n",i,destP[i]);
1728  }
1729 
1730  // TODO None of the code below has been optimized.
1731  for (int k = 0; k < kPatternCount; k++) {
1732  REALTYPE max = 0;
1733  const int patternOffset = k * kPartialsPaddedStateCount;
1734  for (int l = 0; l < kCategoryCount; l++) {
1735  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1736  for (int i = 0; i < kStateCount; i++) {
1737  if(destP[offset] > max)
1738  max = destP[offset];
1739  offset++;
1740  }
1741  }
1742 
1743  if (max == 0)
1744  max = 1.0;
1745 
1746  REALTYPE oneOverMax = REALTYPE(1.0) / max;
1747  for (int l = 0; l < kCategoryCount; l++) {
1748  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1749  for (int i = 0; i < kStateCount; i++)
1750  destP[offset++] *= oneOverMax;
1751  }
1752 
1753  if (kFlags & BEAGLE_FLAG_SCALERS_LOG) {
1754  REALTYPE logMax = log(max);
1755  scaleFactors[k] = logMax;
1756  if( cumulativeScaleFactors != NULL )
1757  cumulativeScaleFactors[k] += logMax;
1758  } else {
1759  scaleFactors[k] = max;
1760  if( cumulativeScaleFactors != NULL )
1761  cumulativeScaleFactors[k] += log(max);
1762  }
1763  }
1764  if (DEBUGGING_OUTPUT) {
1765  for(int i=0; i<kPatternCount; i++)
1766  fprintf(stderr,"new scaleFactor[%d] = %.5f\n",i,scaleFactors[i]);
1767  }
1768 }
1769 
1770 BEAGLE_CPU_TEMPLATE
1771 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::autoRescalePartials(REALTYPE* destP,
1772  signed short* scaleFactors) {
1773 
1774 
1775  for (int k = 0; k < kPatternCount; k++) {
1776  REALTYPE max = 0;
1777  const int patternOffset = k * kPartialsPaddedStateCount;
1778  for (int l = 0; l < kCategoryCount; l++) {
1779  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1780  for (int i = 0; i < kStateCount; i++) {
1781  if(destP[offset] > max)
1782  max = destP[offset];
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCategoryCount; l++) {
1813  int v = l*kPartialsPaddedStateCount*kPatternCount;
1814  for (int k = 0; k < kPatternCount; k++) {
1815  const int state1 = states1[k];
1816  const int state2 = states2[k];
1817  if (DEBUGGING_OUTPUT) {
1818  std::cerr << "calcStatesStates s1 = " << state1 << '\n';
1819  std::cerr << "calcStatesStates s2 = " << state2 << '\n';
1820  }
1821  int w = l * kMatrixSize;
1822  for (int i = 0; i < kStateCount; i++) {
1823  destP[v] = matrices1[w + state1] * matrices2[w + state2];
1824  v++;
1825 
1826  w += kTransPaddedStateCount;
1827  }
1828  v += P_PAD;
1829  }
1830  }
1831 }
1832 
1833 BEAGLE_CPU_TEMPLATE
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835  const int* child1States,
1836  const REALTYPE* child1TransMat,
1837  const int* child2States,
1838  const REALTYPE* child2TransMat,
1839  const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841  for (int l = 0; l < kCategoryCount; l++) {
1842  int v = l*kPartialsPaddedStateCount*kPatternCount;
1843  for (int k = 0; k < kPatternCount; k++) {
1844  const int state1 = child1States[k];
1845  const int state2 = child2States[k];
1846  int w = l * kMatrixSize;
1847  REALTYPE scaleFactor = scaleFactors[k];
1848  for (int i = 0; i < kStateCount; i++) {
1849  destP[v] = child1TransMat[w + state1] *
1850  child2TransMat[w + state2] / scaleFactor;
1851  v++;
1852 
1853  w += kTransPaddedStateCount;
1854  }
1855  v += P_PAD;
1856  }
1857  }
1858 }
1859 
1860 /*
1861  * Calculates partial likelihoods at a node when one child has states and one has partials.
1862  */
1863 BEAGLE_CPU_TEMPLATE
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1865  const int* states1,
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCategoryCount; l++) {
1813  int v = l*kPartialsPaddedStateCount*kPatternCount;
1814  for (int k = 0; k < kPatternCount; k++) {
1815  const int state1 = states1[k];
1816  const int state2 = states2[k];
1817  if (DEBUGGING_OUTPUT) {
1818  std::cerr << "calcStatesStates s1 = " << state1 << '\n';
1819  std::cerr << "calcStatesStates s2 = " << state2 << '\n';
1820  }
1821  int w = l * kMatrixSize;
1822  for (int i = 0; i < kStateCount; i++) {
1823  destP[v] = matrices1[w + state1] * matrices2[w + state2];
1824  v++;
1825 
1826  w += kTransPaddedStateCount;
1827  }
1828  v += P_PAD;
1829  }
1830  }
1831 }
1832 
1833 BEAGLE_CPU_TEMPLATE
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835  const int* child1States,
1836  const REALTYPE* child1TransMat,
1837  const int* child2States,
1838  const REALTYPE* child2TransMat,
1839  const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841  for (int l = 0; l < kCategoryCount; l++) {
1842  int v = l*kPartialsPaddedStateCount*kPatternCount;
1843  for (int k = 0; k < kPatternCount; k++) {
1844  const int state1 = child1States[k];
1845  const int state2 = child2States[k];
1846  int w = l * kMatrixSize;
1847  REALTYPE scaleFactor = scaleFactors[k];
1848  for (int i = 0; i < kStateCount; i++) {
1849  destP[v] = child1TransMat[w + state1] *
1850  child2TransMat[w + state2] / scaleFactor;
1851  v++;
1852 
1853  w += kTransPaddedStateCount;
1854  }
1855  v += P_PAD;
1856  }
1857  }
1858 }
1859 
1860 /*
1861  * Calculates partial likelihoods at a node when one child has states and one has partials.
1862  */
1863 BEAGLE_CPU_TEMPLATE
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1865  const int* states1,
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCategoryCount; l++) {
1813  int v = l*kPartialsPaddedStateCount*kPatternCount;
1814  for (int k = 0; k < kPatternCount; k++) {
1815  const int state1 = states1[k];
1816  const int state2 = states2[k];
1817  if (DEBUGGING_OUTPUT) {
1818  std::cerr << "calcStatesStates s1 = " << state1 << '\n';
1819  std::cerr << "calcStatesStates s2 = " << state2 << '\n';
1820  }
1821  int w = l * kMatrixSize;
1822  for (int i = 0; i < kStateCount; i++) {
1823  destP[v] = matrices1[w + state1] * matrices2[w + state2];
1824  v++;
1825 
1826  w += kTransPaddedStateCount;
1827  }
1828  v += P_PAD;
1829  }
1830  }
1831 }
1832 
1833 BEAGLE_CPU_TEMPLATE
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835  const int* child1States,
1836  const REALTYPE* child1TransMat,
1837  const int* child2States,
1838  const REALTYPE* child2TransMat,
1839  const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841  for (int l = 0; l < kCategoryCount; l++) {
1842  int v = l*kPartialsPaddedStateCount*kPatternCount;
1843  for (int k = 0; k < kPatternCount; k++) {
1844  const int state1 = child1States[k];
1845  const int state2 = child2States[k];
1846  int w = l * kMatrixSize;
1847  REALTYPE scaleFactor = scaleFactors[k];
1848  for (int i = 0; i < kStateCount; i++) {
1849  destP[v] = child1TransMat[w + state1] *
1850  child2TransMat[w + state2] / scaleFactor;
1851  v++;
1852 
1853  w += kTransPaddedStateCount;
1854  }
1855  v += P_PAD;
1856  }
1857  }
1858 }
1859 
1860 /*
1861  * Calculates partial likelihoods at a node when one child has states and one has partials.
1862  */
1863 BEAGLE_CPU_TEMPLATE
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1865  const int* states1,
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCategoryCount; l++) {
1813  int v = l*kPartialsPaddedStateCount*kPatternCount;
1814  for (int k = 0; k < kPatternCount; k++) {
1815  const int state1 = states1[k];
1816  const int state2 = states2[k];
1817  if (DEBUGGING_OUTPUT) {
1818  std::cerr << "calcStatesStates s1 = " << state1 << '\n';
1819  std::cerr << "calcStatesStates s2 = " << state2 << '\n';
1820  }
1821  int w = l * kMatrixSize;
1822  for (int i = 0; i < kStateCount; i++) {
1823  destP[v] = matrices1[w + state1] * matrices2[w + state2];
1824  v++;
1825 
1826  w += kTransPaddedStateCount;
1827  }
1828  v += P_PAD;
1829  }
1830  }
1831 }
1832 
1833 BEAGLE_CPU_TEMPLATE
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835  const int* child1States,
1836  const REALTYPE* child1TransMat,
1837  const int* child2States,
1838  const REALTYPE* child2TransMat,
1839  const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841  for (int l = 0; l < kCategoryCount; l++) {
1842  int v = l*kPartialsPaddedStateCount*kPatternCount;
1843  for (int k = 0; k < kPatternCount; k++) {
1844  const int state1 = child1States[k];
1845  const int state2 = child2States[k];
1846  int w = l * kMatrixSize;
1847  REALTYPE scaleFactor = scaleFactors[k];
1848  for (int i = 0; i < kStateCount; i++) {
1849  destP[v] = child1TransMat[w + state1] *
1850  child2TransMat[w + state2] / scaleFactor;
1851  v++;
1852 
1853  w += kTransPaddedStateCount;
1854  }
1855  v += P_PAD;
1856  }
1857  }
1858 }
1859 
1860 /*
1861  * Calculates partial likelihoods at a node when one child has states and one has partials.
1862  */
1863 BEAGLE_CPU_TEMPLATE
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1865  const int* states1,
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCategoryCount; l++) {
1813  int v = l*kPartialsPaddedStateCount*kPatternCount;
1814  for (int k = 0; k < kPatternCount; k++) {
1815  const int state1 = states1[k];
1816  const int state2 = states2[k];
1817  if (DEBUGGING_OUTPUT) {
1818  std::cerr << "calcStatesStates s1 = " << state1 << '\n';
1819  std::cerr << "calcStatesStates s2 = " << state2 << '\n';
1820  }
1821  int w = l * kMatrixSize;
1822  for (int i = 0; i < kStateCount; i++) {
1823  destP[v] = matrices1[w + state1] * matrices2[w + state2];
1824  v++;
1825 
1826  w += kTransPaddedStateCount;
1827  }
1828  v += P_PAD;
1829  }
1830  }
1831 }
1832 
1833 BEAGLE_CPU_TEMPLATE
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835  const int* child1States,
1836  const REALTYPE* child1TransMat,
1837  const int* child2States,
1838  const REALTYPE* child2TransMat,
1839  const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841  for (int l = 0; l < kCategoryCount; l++) {
1842  int v = l*kPartialsPaddedStateCount*kPatternCount;
1843  for (int k = 0; k < kPatternCount; k++) {
1844  const int state1 = child1States[k];
1845  const int state2 = child2States[k];
1846  int w = l * kMatrixSize;
1847  REALTYPE scaleFactor = scaleFactors[k];
1848  for (int i = 0; i < kStateCount; i++) {
1849  destP[v] = child1TransMat[w + state1] *
1850  child2TransMat[w + state2] / scaleFactor;
1851  v++;
1852 
1853  w += kTransPaddedStateCount;
1854  }
1855  v += P_PAD;
1856  }
1857  }
1858 }
1859 
1860 /*
1861  * Calculates partial likelihoods at a node when one child has states and one has partials.
1862  */
1863 BEAGLE_CPU_TEMPLATE
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1865  const int* states1,
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCatego