/* likelihoods.c

   Written by Frederic Bois
   22 June 2014

   Copyright (c) 2014 Frederic Bois.

   This code is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   See the GNU General Public License at <http://www.gnu.org/licenses/>

   -- Revisions -----
     Logfile:  %F%
    Revision:  %I%
        Date:  %G%
     Modtime:  %U%
      Author:  @a
   -- SCCS  ---------

   Define various data likelihood functions.
*/


/* ----------------------------------------------------------------------------
   Inclusions
*/

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "lexerr.h"
#include "matrices.h"
#include "random.h"
#include "likelihoods.h"


/* ----------------------------------------------------------------------------
   Loglikelihood_diff

   Difference in log-likelihood for a node, given a change in its parenthood.
   Inputs:
     parent : the ID # of the parent node
     child  : the ID # of the node for which we compute the likelihood
     diff   : negative for deletion of the parent-child link, any other value
              for its addition (callers are expected to pass -1 or +1)
     pData  : a data structure
   Output:
     logLdiff: a pointer to the difference of the loglikelihood after and
               before change (new node likelihood - current_ll_node[child]).

   Beware: this changes globally the number and index of the parents of
   "child". On deletion the removed parent is exchanged with the last parent
   of the list before the count is decreased, so the order of the remaining
   parents can change and the removed ID stays in the slot just past the end.

   Asking for the deletion of a link that does not exist (including when the
   child has no parent at all) is a caller error: it is reported on stderr
   and the program exits with EXIT_FAILURE, rather than scanning beyond the
   end of the parents list.
*/
void Loglikelihood_diff (int parent, int child, int diff,
                         double **pData, double *logLdiff)
{
  int    i, iLast;
  double newLL;

  // change parenthood arrays; beware of indices...
  if (diff < 0) {
    // remove parent
    iLast = nParents[child] - 1; // index of the last parent, -1 if none

    i = iLast;
    if ((iLast < 0) || (parent != index_parents[child][iLast])) {
      // not last: look for it among the parents before the last one,
      // never reading past the end of the list
      for (i = 0; i < iLast; i++) {
        if (parent == index_parents[child][i])
          break;
      }

      if (i >= iLast) { // also true when the list is empty
        fprintf (stderr,
                 "Error: Loglikelihood_diff: node %d is not a parent "
                 "of node %d - Exiting.\n\n", parent, child);
        exit (EXIT_FAILURE);
      }

      /* exchange with the last parent. Note that this shuffles
         the order of parents in list */
      index_parents[child][i]     = index_parents[child][iLast];
      index_parents[child][iLast] = parent;
    }

    // in all cases the removed parent is now last: decrease the count
    nParents[child] = iLast;
  }
  else {
    // add parent at the end of the list
    index_parents[child][nParents[child]] = parent;
    nParents[child] += 1;
  }

  // now compute the likelihood of the child with its updated parents
  if (bBN) {
    if (bZellner)
      newLL = ZLoglikelihood_node (child, pData);
    else if (bDirichlet)
      newLL = DLoglikelihood_node (child, pData);
    else
      newLL = GLoglikelihood_node (child, pData);
  }
  else { // bDBN, hopefully
    if (bZellner)
      newLL = ZLoglikelihood_node_DBN (child, pData);
    else if (bDirichlet)
      newLL = DLoglikelihood_node_DBN (child, pData);
    else
      newLL = GLoglikelihood_node_DBN (child, pData);
  }

  *logLdiff = newLL - current_ll_node[child];

} /* Loglikelihood_diff */


/* ----------------------------------------------------------------------------
   Loglikelihood_full

   Full log-likelihood for the current graph.
   The arrays nParents and index_parents are supposed to be meaningfully
   initialized using the starting or current adjacency matrix; they are not
   read here but in the individual node likelihood calculations called below.
   The likelihood of "Control" nodes, for which bAllowed_parents is FALSE,
   is not computed (those nodes are jumped over). However, they are still
   used as parents of their children and then take as values the data they
   have been given in the input file.
   Inputs:
     N : number of nodes
     pData : a data structure
   Returns the sum of the log-likelihoods of the nodes that were not skipped.
   Side effect: each computed node log-likelihood is stored in the global
   current_ll_node, which is allocated (N elements) on first use.
*/
double Loglikelihood_full (int N, double **pData)
{
  double total = 0;
  double nodeLL;
  int    node;

  // lazily create the per-node cache
  if (!current_ll_node)
    current_ll_node = InitdVector(N);

  for (node = 0; node < N; node++) {

    // control nodes contribute nothing
    if (!bAllowed_parents[node])
      continue;

    // pick the scoring function matching the network and prior types
    if (bBN && bZellner)
      nodeLL = ZLoglikelihood_node (node, pData);
    else if (bBN && bDirichlet)
      nodeLL = DLoglikelihood_node (node, pData);
    else if (bBN)
      nodeLL = GLoglikelihood_node (node, pData);
    else if (bZellner) // from here on: bDBN, hopefully
      nodeLL = ZLoglikelihood_node_DBN (node, pData);
    else if (bDirichlet)
      nodeLL = DLoglikelihood_node_DBN (node, pData);
    else
      nodeLL = GLoglikelihood_node_DBN (node, pData);

    // remember it and accumulate
    current_ll_node[node] = nodeLL;
    total += nodeLL;
  }

  return (total);

} /* Loglikelihood_full */


/* ----------------------------------------------------------------------------
   GLoglikelihood_node

   Log-likelihood of the data of one node under a linear regression on its
   parents (taken as regressors X) with a normal-gamma prior.
   The prior hyperparameters are (see Bernardo's book appendix)
    . mean of regression parameters : all null
    . precision matrix of the regression params joint distribution: 1 * I
      (called n_zero below)
    . alpha (shape, named alpha_normal_gamma) of the gamma distribution for
      lambda (precision of measurements)
    . beta (rate, beta_normal_gamma) of the same gamma distribution
   Inputs:
    node: node number
    pData: data array, indexed as pData[node][data point]
   Returns the value given by LnMultivariateT for the node data, with the
   scale matrix (I - X inv(X'X + n_zero) X') * alpha / beta and
   2 * alpha degrees of freedom.

   The design matrix X is never formed: its first column is all 1 and its
   other columns are the data of the parents of the node.
   Two square work matrices are allocated at the first call and reused.
*/
double GLoglikelihood_node (int node, double **pData)
{
  static double **pdXtX  = NULL; // X'X + n_zero, its inverse, then the result
  static double **pdWork = NULL; // X * inv(X'X + n_zero)

  const int nPar  = nParents[node];
  const int nCoef = 1 + nPar;    // intercept + one coefficient per parent

  int    i, j, k;
  double ratio, df;

  if (!pdXtX) { // generously sized scratch space, allocated once
    int dim = (nNodes > nData ? nNodes : nData);
    pdXtX  = InitdMatrix(dim, dim);
    pdWork = InitdMatrix(dim, dim);
  }

  // the vector mu of prior data expectations is null, so it never appears

  // --- X'X + n_zero, filled symmetrically ---

  pdXtX[0][0] = 1 + nData; // n_zero element 1 + sum of n ones

  for (i = 0; i < nPar; i++) {
    const double *xi = pData[index_parents[node][i]];

    // intercept row and column: plain sum of the parent data
    double sum = 0; // n_zero off diagonal element
    for (k = 0; k < nData; k++)
      sum += xi[k];
    pdXtX[0][i+1] = sum;
    pdXtX[i+1][0] = sum;

    // cross-products between parents, upper triangle then mirrored
    for (j = i; j < nPar; j++) {
      const double *xj = pData[index_parents[node][j]];
      double dot = (i == j) ? 1 : 0; // n_zero element
      for (k = 0; k < nData; k++)
        dot += xi[k] * xj[k];
      pdXtX[i+1][j+1] = dot;
      pdXtX[j+1][i+1] = dot;
    }
  }

  // --- invert it in place (scalar case handled directly) ---
  if (nPar == 0)
    pdXtX[0][0] = 1 / pdXtX[0][0];
  else
    InvertMatrix(pdXtX, nCoef);

  // --- pdWork = X * inverse; the column of 1 needs no multiplication ---
  for (i = 0; i < nData; i++) {
    for (j = 0; j < nCoef; j++) {
      double acc = pdXtX[0][j];
      for (k = 1; k < nCoef; k++)
        acc += pData[index_parents[node][k-1]][i] * pdXtX[k][j];
      pdWork[i][j] = acc;
    }
  }

  /* --- (I - pdWork * X') * alpha / beta, stored back in pdXtX;
         only the upper triangle is computed, then mirrored --- */
  ratio = alpha_normal_gamma / beta_normal_gamma;

  for (i = 0; i < nData; i++) {
    for (j = i; j < nData; j++) {
      double acc = (i == j) ? 1 - pdWork[i][0] : -pdWork[i][0];
      for (k = 1; k < nCoef; k++)
        acc -= pdWork[i][k] * pData[index_parents[node][k-1]][j];
      acc *= ratio;
      pdXtX[i][j] = acc;
      pdXtX[j][i] = acc;
    }
  }

  // degrees of freedom
  df = 2 * alpha_normal_gamma;

  // evaluate the data density
  return (LnMultivariateT(pData[node], nData, pdXtX, df));

} /* GLoglikelihood_node */


/* ----------------------------------------------------------------------------
   GLoglikelihood_node_DBN

   Log-likelihood of the data of one node under a linear regression on its
   parents (taken as regressors X) with a normal-gamma prior.

   This is the DBN (dynamic BN) version, where the child value at time t
   depends on its parents values at time t-1. The node data are therefore
   read at indices 1 to nData - 1 and the parents data at indices 0 to
   nData - 2, which leaves nData - 1 usable data points.

   The prior hyperparameters are (see Bernardo's book appendix)
    . mean of regression parameters : all null
    . precision matrix of the regression params joint distribution: 1 * I
      (called n_zero below)
    . alpha (shape, named alpha_normal_gamma) of the gamma distribution for
      lambda (precision of measurements)
    . beta (rate, beta_normal_gamma) of the same gamma distribution
   Inputs:
    node: node number
    pData: data array, indexed as pData[node][time]
   Returns the value given by LnMultivariateT for the shifted node data, with
   the scale matrix (I - X inv(X'X + n_zero) X') * alpha / beta and
   2 * alpha degrees of freedom.

   The work matrices and the number of usable data points are set up at the
   first call only and reused afterwards.
*/
double GLoglikelihood_node_DBN (int node, double **pData)
{
  static double **pdA = NULL; // X'X + n_zero, its inverse, then the result
  static double **pdB = NULL; // X * inv(X'X + n_zero)
  static int      nLagged;    // number of (t-1, t) pairs: nData - 1

  int    nPar, nCoef;
  int    r, c, t;
  double scale, df;

  if (!pdA) { // generously sized scratch space, allocated once
    int dim = (nNodes + 1 > nData ? nNodes + 1 : nData);
    pdA = InitdMatrix(dim, dim);
    pdB = InitdMatrix(dim, dim);

    // for DBNs there are in fact n - 1 data, compute it once
    nLagged = nData - 1;
  }

  nPar  = nParents[node];
  nCoef = 1 + nPar; // intercept + one coefficient per parent

  // the vector mu of prior data expectations is null, so it never appears

  // X is implicit: a column of 1 followed by one column per parent,
  // restricted here to the lines 0 to nData - 2

  // corner element of X'X + n_zero
  pdA[0][0] = nData; // n_zero element 1 + (n - 1) * 1

  // first line and first column: sums of the parents data
  for (c = 0; c < nPar; c++) {
    const double *xc = pData[index_parents[node][c]];
    double sum = 0; // n_zero off diagonal element
    for (t = 0; t < nLagged; t++)
      sum += xc[t];
    pdA[0][c+1] = sum;
    pdA[c+1][0] = sum;
  }

  // remaining elements: cross-products of the parents data
  for (r = 0; r < nPar; r++) {
    const double *xr = pData[index_parents[node][r]];
    for (c = r; c < nPar; c++) {
      const double *xc = pData[index_parents[node][c]];
      double dot = (r == c) ? 1 : 0; // n_zero element
      for (t = 0; t < nLagged; t++)
        dot += xr[t] * xc[t];
      pdA[r+1][c+1] = dot;
      pdA[c+1][r+1] = dot;
    }
  }

  // invert X'X + n_zero in place
  if (nPar == 0)
    pdA[0][0] = 1 / pdA[0][0];
  else
    InvertMatrix(pdA, nCoef);

  // premultiply by X, result in pdB (the column of 1 needs no product)
  for (r = 0; r < nLagged; r++) {
    for (c = 0; c < nCoef; c++) {
      double acc = pdA[0][c];
      for (t = 1; t < nCoef; t++)
        acc += pData[index_parents[node][t-1]][r] * pdA[t][c];
      pdB[r][c] = acc;
    }
  }

  /* postmultiply by X', subtract from I and multiply by alpha / beta;
     pdA is recycled to hold the (symmetric) result */
  scale = alpha_normal_gamma / beta_normal_gamma;

  for (r = 0; r < nLagged; r++) {
    for (c = r; c < nLagged; c++) {
      double acc = (r == c) ? 1 - pdB[r][0] : -pdB[r][0];
      for (t = 1; t < nCoef; t++)
        acc -= pdB[r][t] * pData[index_parents[node][t-1]][c];
      acc *= scale;
      pdA[r][c] = acc;
      pdA[c][r] = acc;
    }
  }

  // degrees of freedom
  df = 2 * alpha_normal_gamma;

  // evaluate the data density, node data being taken from index 1 on
  return (LnMultivariateT(pData[node]+1, nLagged, pdA, df));

} /* GLoglikelihood_node_DBN */


/* ----------------------------------------------------------------------------
   GPostPredictiveSample_node

   UNFINISHED. Intended to return samples from the posterior predictive
   distribution of data for a given node, given a regression model, a
   normal-gamma prior on its parameters, the values of its parents (taken as
   regressors X), and observed data value (on the node of interest).

   As it stands, the function builds and inverts t(X) * X + n_zero, then
   prints "GPostPredictiveSample_node not finished - Exiting." and terminates
   the program with exit(0). Everything after that call is unreachable, and
   pSamples is never written.

   The prior hyperparameters are (see Bernardo's book appendix)
    . mean of regression parameters : all null
    . precision matrix of the regression params joint distribution: 1 * I
    . alpha (shape, named alpha_normal_gamma) of the gamma distribution for
      lambda (precision of measurements)
    . beta (rate, beta_normal_gamma) of the same gamma distribution
   Inputs:
    node: node number
    pData: data matrix
    pSamples: a properly allocated array of sampled values
   Output:
    pSamples: meant to be overwritten with the samples (not done yet)

   NOTE(review): the program exits with status 0 (success) although the
   request could not be honored; a failure status may be more appropriate.
*/
double GPostPredictiveSample_node (int node, double **pData, double *pSamples)
{
  int i, j, k;
  static double **pdM1 = NULL; // work matrix, allocated at the first call
  static double **pdM2 = NULL; // work matrix, allocated at the first call

  double df, LL;


// The region below is never compiled (ndef is presumably never defined):
// it holds an R sketch of the intended calculation, kept as a design note.
#ifdef ndef
===========

node=1
parents = c(2, 3)  # list of parents
eta        = length(parents)  # number of parents of node 1
k           = eta + 1 # number of regression parameters
n           = 4
dat = matrix(c(7.1, 1.3, 2.1, 0.6,
               0.4, 3.1, 2.2, 5.5,
               2.0, 1.7, 12., 7.4), nrow=4)

# new design matrix
X = matrix(data = c(rep(1, n), dat[, parents]), ncol = 1 + eta, nrow = n)

# data for the node we are looking at, missing data just not looked at
y = as.matrix(dat[,node])

# the regression parameter posterior is bivariate t
theta_n = solve(n_zero + t(X) %*% X) %*% (t(X) %*% y)

beta_n = beta + 0.5 * (t(y - X %*% theta_n) %*% y)

// the predictive posterior for a data point is univariate t
// for all data it's multi-T (simple).
i = 1
xi = as.vector(X[i,])
yi = y[i]

mu = xi %*% theta_n
fnx = diag(1,n) - xi %*%
              solve(as.matrix(xi, ncol=1) %*% xi + n_zero + t(X) %*% X) %*%
              as.matrix(xi, ncol=1)
precision = fnx * (alpha + n / 2) / beta_n
df = 2 * alpha + n
dTm(yi, 1, mu, precision, df)

===========
#endif

  if (!pdM1) { // stupidly large arrays, square, sized by max(nNodes, nData)
    int dim = (nNodes > nData ? nNodes : nData);
    pdM1 = InitdMatrix(dim, dim);
    pdM2 = InitdMatrix(dim, dim);
  }

  /* the vector mu (aka theta_zero) of prior data expectations is null,
     so forget it

     nzero is taken for now to be the identity matrix

     the calculation goes as:

     theta_n = solve(n_zero + t(X) * X) * (t(X) * y)

     beta_n = beta + 0.5 * (t(y - X * theta_n) * y)

     mu = X[k] * theta_n
     fnx = diag(1,n) -
           X[k] * solve(t(X[k]) * X[k] + n_zero + t(X) * X) * t(X[k])

     precision = fnx * (alpha + n / 2) / beta_n
     df = 2 * alpha + n
     data Tm(y[k], 1, mu, precision, df)
     the predictive posterior for a data point k is univariate t
     with parameters mu, precision, df;

     for all data it should be a multivariate T. */

  // step 1
  // store t(X) * X + n_zero in pdM1, X being the design matrix
  // X is implicit given the parents of the node considered, otherwise it
  // would have a column of 1 and the data of the parents as other columns.
  // Remember that n_zero is set to the identity matrix

  pdM1[0][0] = 1 + nData; // n_zero element 1 + n * 1

  // 1st line and 1st column of pdM1: sums of each parent's data
  for (j = 0; j < nParents[node]; j++) {
    pdM1[0][j+1] = 0; // set it to n_zero off diagonal element
    for (k = 0; k < nData; k++) {
      pdM1[0][j+1] += pData[index_parents[node][j]][k];
    }
    pdM1[j+1][0] = pdM1[0][j+1]; // mirror, the matrix is symmetric
  }

  // rest of pdM1: cross-products of the parents data, upper triangle
  // computed and mirrored
  for (i = 0; i < nParents[node]; i++) {
    for (j = i; j < nParents[node]; j++) {
      if (i == j)
        pdM1[i+1][j+1] = 1; // set it to n_zero diagonal element
      else
        pdM1[i+1][j+1] = 0;   // set it to n_zero off diagonal element
      for (k = 0; k < nData; k++) {
        pdM1[i+1][j+1] += pData[index_parents[node][i]][k] *
                          pData[index_parents[node][j]][k];
      }
      pdM1[j+1][i+1] = pdM1[i+1][j+1];
    }
  }

  // invert t(X) * X + n_zero, that is: invert pdM1
  // (a node without parents gives a 1 x 1 matrix, inverted directly)
  if (nParents[node] == 0) {
    pdM1[0][0] = 1 / pdM1[0][0];
  }
  else
    InvertMatrix(pdM1, 1+nParents[node]);

  // step 2
  // form t(X) * y  (not written yet)

  // The implementation stops here: report it and terminate the program.
  // NOTE(review): exit status 0 signals success to the calling environment.
  printf ("GPostPredictiveSample_node not finished - Exiting.\n\n");
  exit(0);

  // ---- everything below is unreachable; it mirrors the end of
  // ---- GLoglikelihood_node and is a placeholder for the missing steps

  // step 3
  // form theta_n = pdM1 * (t(X) * y)

  // ...

  // premultiply by X, store the result in pdM2
  for (i = 0; i < nData; i++) {
    for (j = 0; j < nParents[node]+1; j++) {
      pdM2[i][j] = pdM1[0][j]; // no need to multiply by 1
      for (k = 1; k < nParents[node]+1; k++) {
        pdM2[i][j] += pData[index_parents[node][k-1]][i] *
                      pdM1[k][j];
      }
    }
  }

  /* postmultiply by t(X), subtract from I and multiply by alpha / beta;
     use pdM1 to store the result */
  for (i = 0; i < nData; i++) {
    for (j = i; j < nData; j++) {
      if (i == j) {
        pdM1[i][j] = 1 - pdM2[i][0]; // no need to multiply by 1
      }
      else {
        pdM1[i][j] =   - pdM2[i][0]; // no need to multiply by 1
      }
      for (k = 1; k < nParents[node]+1; k++) {
        pdM1[i][j] -= pdM2[i][k] *
                      pData[index_parents[node][k-1]][j];
      }
      pdM1[i][j] *= alpha_normal_gamma / beta_normal_gamma;
      pdM1[j][i] = pdM1[i][j]; // mirror, the matrix is symmetric
    }
  }

  // degrees of freedom
  df = 2 * alpha_normal_gamma;

  // evaluate the data density
  LL = LnMultivariateT(pData[node], nData, pdM1, df);

  return(LL);

} /* GPostPredictiveSample_node */


/* ----------------------------------------------------------------------------
   sort

   In-place heapsort of the n first elements of vect, in increasing order.
   Inputs:
     n    : number of elements to sort
     vect : array of at least n doubles, overwritten with the sorted values

   Arrays of fewer than two elements are already sorted and are left
   untouched: in particular n <= 0 returns immediately instead of reading and
   writing before the start of vect. A NULL vect is ignored as well.

   The indices k, l, i and j below are 1-based positions in the heap, hence
   the "- 1" in every array access.
*/
void sort(long n, double *vect)
{
  long i, j, k, l; // same width as n, so large counts are not truncated
  double temp;

  if (n < 2 || vect == NULL)
    return;

  k = (n >> 1) + 1; // one past the last node that has a child
  l = n;            // current size of the heap
  for (;;) {
    if (k > 1) {
      // heap building phase: sift down the next internal node
      temp = vect[--k - 1];
    }
    else {
      // extraction phase: move the largest element after the heap and
      // sift down the element it displaces
      temp = vect[l-1];
      vect[l-1] = vect[0];
      if (--l == 0) {
        vect[0] = temp;
        return;
      }
    }

    // sift temp down from position k to its proper level
    i = k;
    j = k << 1;
    while (j <= l) {
      if (j < l && vect[j-1] < vect[j])
        ++j; // pick the larger of the two children
      if (temp < vect[j-1]) {
        vect[i-1] = vect[j-1]; // promote the child and go one level down
        i = j;
        j += j;
      }
      else
        j = l + 1; // temp is at its level: leave the loop
    }
    vect[i-1] = temp;
  }
} /* sort */


/* ----------------------------------------------------------------------------
   DLoglikelihood_node

   Computes the log-likelihood of the data for one node, given a Dirichlet
   prior and multinomial data.
   Inputs:
    node: node number
    pData: data array, with levels coded 0, 1, ...

   For a good explanation see:
   - Laskey and Myers, 2003, Machine Learning, 50:175-196.
   For some more detail see:
   - Heckerman et al., 1994, in Proceedings of Tenth Conference on Uncertainty
     in Artificial Intelligence, Seattle, WA, p. 293-301. Morgan Kanfmann.
   - Heckerman et al., 1995, Machine Learning, 20, 197-243
*/
double DLoglikelihood_node (int node, double **pData)
{
  int    i, j, nConfigs;

  double LL, N_prime_ij, N_prime_ijk;

  static int    *piCardConfig      = NULL;
  static double *pdCodesP          = NULL;
  static double *pdCodesPE         = NULL;
  static double *pdCumConfigNumber = NULL;
  static double *pdIndexConfig     = NULL;

  if (!pdCodesPE) {
    pdCodesP          = InitdVector(nData);
    pdCodesPE         = InitdVector(nData);
    pdIndexConfig     = InitdVector(nData);
    piCardConfig      = InitiVector(nData);
    pdCumConfigNumber = InitdVector(nNodes);
  }

  // Dirichlet prior sample size of any given configuration of parents values.
  // case no parents or uniform:
  N_prime_ijk = 1.0;

  // another possibility, if there are parents, is to set N_prime_ijk at
  // 1 / number of configurations of parents = 1 / prod_(pDataLevels[parents]).
  // That should penalize higher number of parents
#ifdef NDEF
  for (i = 0; i < nParents[node]; i++)
    N_prime_ijk /= (double) pDataLevels[index_parents[node][i]];
    // in any case, that calculation can be omitted if pdCumConfigNumber
    // is assigned to one, below
#endif

  // marginal prior sample size on node: pDataLevels[node] * N_prime_ijk.
  // the actual detailed calculation is the sum from 1 to pDataLevels[node]
  // of the prior sample sizes for each configuration of parents
  N_prime_ij = N_prime_ijk * pDataLevels[node];

  // cumulated products of levels for configurations encoding
  pdCumConfigNumber[0] = pDataLevels[node];
  for (i = 0; i < nParents[node]; i++)
    pdCumConfigNumber[i+1] = pdCumConfigNumber[i] *
                             pDataLevels[index_parents[node][i]];

  // encoding of node and parents configurations:
  for (i = 0; i < nData; i++) {
    pdCodesPE[i] = pData[node][i];
    for (j = 0; j < nParents[node]; j++)
      pdCodesPE[i] += pData[index_parents[node][j]][i] * pdCumConfigNumber[j];
  }

  // form the codes of just the parents configurations to form the marginals
  // do this before sorting pdCodesPE!
  // sort the parents configurations if needed
  if (nParents[node] > 0) {
    for (i = 0; i < nData; i++)
      pdCodesP[i] = pdCodesPE[i] - pData[node][i];
    sort(nData, pdCodesP);
  }

  // sort the various node and parents configurations
  sort(nData, pdCodesPE);

  // count (tabulate) the nConfigs unique node and parents configurations
  j = 0;
  pdIndexConfig[j] = pdCodesPE[0];
  piCardConfig[j] = 1;
  for (i = 1; i < nData; i++) {
    if (pdCodesPE[i] == pdIndexConfig[j])
      piCardConfig[j]++;
    else {
      j++;
      pdIndexConfig[j] = pdCodesPE[i];
      piCardConfig[j] = 1;
    }
  }
  nConfigs = j + 1;

  LL = 0;

  // term for updated counts
  for (i = 0; i < nConfigs; i++)
    LL += LnGamma(N_prime_ijk + piCardConfig[i]);

  // term for prior, saving time if LnGamma is zero
  if ((N_prime_ijk != 1) && (N_prime_ijk != 2))
    LL -= nConfigs * LnGamma(N_prime_ijk);

  // now deal with the marginal terms:
  // count (tabulate) the nConfigs unique parents configurations
  if (nParents[node] == 0) {
    piCardConfig[0] = nData;
    nConfigs = 1;
  }
  else {
    j = 0;
    pdIndexConfig[j] = pdCodesP[0];
    piCardConfig[j] = 1;
    for (i = 1; i < nData; i++) {
      if (pdCodesP[i] == pdIndexConfig[j])
        piCardConfig[j]++;
      else {
        j++;
        pdIndexConfig[j] = pdCodesP[i];
        piCardConfig[j] = 1;
      }
    }
    nConfigs = j + 1;
  }

  // term for updated marginal counts
  for (i = 0; i < nConfigs; i++)
    LL -= LnGamma(N_prime_ij + piCardConfig[i]);

  // term for marginal prior, saving time if LnGamma is zero
  if ((N_prime_ij != 1) && (N_prime_ij != 2))
    LL += nConfigs * LnGamma(N_prime_ij);

  return(LL);

} /* DLoglikelihood_node */


/* ----------------------------------------------------------------------------
   DLoglikelihood_node_DBN

   Computes the log-likelihood of the data for one node, given a Dirichlet
   prior and multinomial data.
   Inputs:
    node: node number
    pData: data array, with levels coded 0, 1, ...

   This is the DBN (dynamic BN) version, where the child value at time t
   depends on its parents values at time t-1. The node data are therefore
   read at indices 1 to nData - 1 and the parents data at indices 0 to
   nData - 2, which leaves nData - 1 usable data points.

   Each usable data point is encoded as a single number combining the level
   of the node and the levels of its parents; a second code keeps the parents
   part only. Both code lists are sorted and tabulated to get the counts of
   the observed configurations, from which the sums of LnGamma terms are
   formed. Work arrays and the number of usable data points are set up at
   the first call only.

   The array of encoding multipliers gets one element more than the number
   of nodes, because the loop filling it writes up to index nParents[node]
   and, as the sizing of the work arrays of the other DBN likelihoods
   (nNodes + 1) indicates, a DBN node can have as many parents as there are
   nodes.

   For a good explanation see:
   - Laskey and Myers, 2003, Machine Learning, 50:175-196.
   For some more detail see:
   - Heckerman et al., 1994, in Proceedings of Tenth Conference on Uncertainty
     in Artificial Intelligence, Seattle, WA, p. 293-301. Morgan Kaufmann.
   - Heckerman et al., 1995, Machine Learning, 20, 197-243
*/
double DLoglikelihood_node_DBN (int node, double **pData)
{
  int    i, j, nConfigs;

  double LL, N_prime_ij, N_prime_ijk;

  static int    *piCardConfig      = NULL; // occurrences of each config
  static double *pdCodesP          = NULL; // parents configurations codes
  static double *pdCodesPE         = NULL; // node + parents configs codes
  static double *pdCumConfigNumber = NULL; // encoding multipliers
  static double *pdIndexConfig     = NULL; // distinct codes found
  static int     nDataM1;

  if (!pdCodesPE) {
    pdCodesP          = InitdVector(nData);
    pdCodesPE         = InitdVector(nData);
    pdIndexConfig     = InitdVector(nData);
    piCardConfig      = InitiVector(nData);
    // indices 0 to nParents[node] are written below and nParents[node]
    // can reach nNodes in a DBN: one more element than nodes is needed
    pdCumConfigNumber = InitdVector(nNodes + 1);

    // For DBNs there are in fact n-1 data, compute it once
    nDataM1 = nData - 1;
  }

  // Dirichlet prior sample size of any given configuration of parents values.
  // case no parents or uniform:
  N_prime_ijk = 1.0;

  // another possibility, if there are parents, is to set N_prime_ijk at
  // 1 / number of configurations of parents = 1 / prod_(pDataLevels[parents]).
  // That should penalize higher number of parents
#ifdef NDEF
  for (i = 0; i < nParents[node]; i++)
    N_prime_ijk /= (double) pDataLevels[index_parents[node][i]];
    // in any case, that calculation can be omitted if pdCumConfigNumber
    // is assigned to one, below
#endif
// if that part of code is enabled you should also enable the one below

  // marginal prior sample size on node: pDataLevels[node] * N_prime_ijk.
  // the actual detailed calculation is the sum from 1 to pDataLevels[node]
  // of the prior sample sizes for each configuration of parents
  N_prime_ij = N_prime_ijk * pDataLevels[node];

  // cumulated products of levels for configurations encoding
  pdCumConfigNumber[0] = pDataLevels[node];
  for (i = 0; i < nParents[node]; i++)
    pdCumConfigNumber[i+1] = pdCumConfigNumber[i] *
                             pDataLevels[index_parents[node][i]];

  // encoding of node and parents configurations:
  // node at time i+1 with its parents at time i
  for (i = 0; i < nDataM1; i++) {
    pdCodesPE[i] = pData[node][i+1];
    for (j = 0; j < nParents[node]; j++)
      pdCodesPE[i] += pData[index_parents[node][j]][i] * pdCumConfigNumber[j];
  }

  // form the codes of just the parents configurations to form the marginals
  // do this before sorting pdCodesPE!
  // sort the parents configurations if needed
  if (nParents[node] > 0) {
    for (i = 0; i < nDataM1; i++)
      pdCodesP[i] = pdCodesPE[i] - pData[node][i+1];
    sort(nDataM1, pdCodesP);
  }

  // sort the various node and parents configurations
  sort(nDataM1, pdCodesPE);

  // count (tabulate) the nConfigs unique node and parents configurations
  j = 0;
  pdIndexConfig[j] = pdCodesPE[0];
  piCardConfig[j] = 1;
  for (i = 1; i < nDataM1; i++) {
    if (pdCodesPE[i] == pdIndexConfig[j])
      piCardConfig[j]++;
    else {
      j++;
      pdIndexConfig[j] = pdCodesPE[i];
      piCardConfig[j] = 1;
    }
  }
  nConfigs = j + 1;

  LL = 0;

  // term for updated counts
  for (i = 0; i < nConfigs; i++)
    LL += LnGamma(N_prime_ijk + piCardConfig[i]);

  // disabled: with N_prime_ijk fixed at 1 this term would be skipped anyway
#ifdef NDEF
  // term for prior, saving time if LnGamma is zero
  if ((N_prime_ijk != 1) && (N_prime_ijk != 2))
    LL -= nConfigs * LnGamma(N_prime_ijk);
#endif

  // now deal with the marginal terms:
  // count (tabulate) the nConfigs unique parents configurations
  if (nParents[node] == 0) {
    // a single, empty, configuration holding all the usable data
    piCardConfig[0] = nDataM1;
    nConfigs = 1;
  }
  else {
    j = 0;
    pdIndexConfig[j] = pdCodesP[0];
    piCardConfig[j] = 1;
    for (i = 1; i < nDataM1; i++) {
      if (pdCodesP[i] == pdIndexConfig[j])
        piCardConfig[j]++;
      else {
        j++;
        pdIndexConfig[j] = pdCodesP[i];
        piCardConfig[j] = 1;
      }
    }
    nConfigs = j + 1;
  }

  // term for updated marginal counts
  for (i = 0; i < nConfigs; i++)
    LL -= LnGamma(N_prime_ij + piCardConfig[i]);

  // term for marginal prior, saving time if LnGamma is zero
  if ((N_prime_ij != 1) && (N_prime_ij != 2))
    LL += nConfigs * LnGamma(N_prime_ij);

  return(LL);

} /* DLoglikelihood_node_DBN */


/* ----------------------------------------------------------------------------
   ZLoglikelihood_node

   Computes the log-likelihood of the data for one node, given a Zellner
   prior and the values of its parents (taken as regressors X).
   The Zellner prior is improper and cannot be computed if there is the same
   number of or more parents than data points for the node considered.
   Inputs:
    node: node number
    pData: data array, indexed as pData[node][data point]

   Computation proceeds by forming
   mx = Y' * Y - g_z/(g_z + 1) * (Y' * X) * inv((X' * X)) * (X' * Y)
   where g_z is a tuning parameter (the global gamma_zellner)
   and then
   Loglikelihood = -(eta + 1)/2 * log(g_z + 1) - n/2 * log(mx)

   see Celeux, Marin, Robert, 2006,
       Selection bayesienne de variables en regression lineaire,
       Journal de la Societe Francaise de Statistique, 147:59-79 (p 63 & 68)
   and Smith & Kohn, 1996,
       Nonparametric Regression using Bayesian Variable Selection,
       Journal of Econometrics 75:317-343.

   where Y are the data for the node considered and X the design matrix
   (a column of 1 for the constant term and a column of data for each parent,
   the data of the parents being taken as regressors), n is the number of data
   points for node, eta is the number of parents of the node considered.

   X is never formed explicitly. The work matrix and vectors are allocated at
   the first call and reused.
*/
double ZLoglikelihood_node (int node, double **pData)
{
  static double **pdXtX  = NULL; // X'X, then its inverse
  static double  *pdYtX  = NULL; // Y'X
  static double  *pdProj = NULL; // Y'X * inv(X'X)

  const double *y = pData[node];
  int    nPar = nParents[node];
  int    i, j, k;
  double mx, quad, acc;

  if (!pdXtX) { // generously sized scratch space, allocated once
    int dim = (nNodes > nData ? nNodes : nData); // the largest
    pdXtX = InitdMatrix(dim, dim);

    dim = (nNodes > nData ? nData : nNodes); // the smallest
    pdYtX  = InitdVector(dim);
    pdProj = InitdVector(dim);
  }

  // Y'Y
  mx = 0;
  for (i = 0; i < nData; i++)
    mx += pow(y[i], 2);

  // Y'X, intercept part: the first column of X is all 1
  acc = y[0];
  for (i = 1; i < nData; i++)
    acc += y[i];
  pdYtX[0] = acc;

  // X'X, corner element
  pdXtX[0][0] = nData; // n * 1

  for (j = 0; j < nPar; j++) {
    const double *xj = pData[index_parents[node][j]];

    // Y'X, part of parent j
    acc = y[0] * xj[0];
    for (i = 1; i < nData; i++)
      acc += y[i] * xj[i];
    pdYtX[j+1] = acc;

    // first line and first column of X'X
    acc = 0;
    for (k = 0; k < nData; k++)
      acc += xj[k];
    pdXtX[0][j+1] = acc;
    pdXtX[j+1][0] = acc;
  }

  // rest of X'X, upper triangle computed and mirrored
  for (i = 0; i < nPar; i++) {
    const double *xi = pData[index_parents[node][i]];
    for (j = i; j < nPar; j++) {
      const double *xj = pData[index_parents[node][j]];
      acc = 0;
      for (k = 0; k < nData; k++)
        acc += xi[k] * xj[k];
      pdXtX[i+1][j+1] = acc;
      pdXtX[j+1][i+1] = acc;
    }
  }

  // invert X'X in place (scalar case handled directly)
  if (nPar == 0)
    pdXtX[0][0] = 1 / pdXtX[0][0];
  else
    InvertMatrix(pdXtX, 1 + nPar);

  // (Y'X) * inv(X'X)
  for (j = 0; j <= nPar; j++) {
    acc = pdYtX[0] * pdXtX[0][j];
    for (i = 1; i <= nPar; i++)
      acc += pdYtX[i] * pdXtX[j][i];
    pdProj[j] = acc;
  }

  // quadratic form (Y'X) * inv(X'X) * (X'Y)
  quad = pdProj[0] * pdYtX[0];
  for (i = 1; i <= nPar; i++)
    quad += pdProj[i] * pdYtX[i];

  // finish mx: subtract the shrunk quadratic form from Y'Y
  mx = mx - gamma_zellner / (gamma_zellner + 1) * quad;

  return (-0.5 * ((nPar + 1) * log(gamma_zellner + 1) +
                  (nData * log(mx))));

} /* ZLoglikelihood_node */


/* ----------------------------------------------------------------------------
   ZLoglikelihood_node_DBN

   Computes the log-likelihood of the data for one node, given a Zellner
   prior and the values of its parents (taken as regressors X)
   The Zellner prior is improper and cannot be computed if there is the same
   number of or more parents than data points for the node considered.
   Inputs:
    node: node number
    pData: data array

   This is the DBN (dynamic BN) version, where the child value at time t
   depends on its parents values at time t-1. So basically the index
   i in Data[node][i] needs to run from 1 to nData and needs to run from
   0 to nData - 1 in Data[index_parents[...]][i].

   Computation proceeds by forming
   mx = Y' * Y - g_z/(g_z + 1) * (Y' * X) * inv((X' * X)) * (X' * Y)
   where g_z is a tuning parameter
   and then
   Loglikelihood = -(eta + 1)/2 * log(g_z + 1) - n/2 * log(mx)

   see Celeux, Marin, Robert, 2006,
       Selection bayesienne de variables en regression lineaire,
       Journal de la Societe Francaise de Statistique, 147:59-79 (p 63 & 68)
   and Smith & Kohn, 1996,
       Nonparametric Regression using Bayesian Variable Selection,
       Journal of Econometrics 75:317–343.

   where Y are the data for the node considered and X the design matrix
   (a column of 1 for the constant term and a column of data for each parent,
   the data of the parents being taken as regressors), n is the number of data
   points for node, eta is the number of parents of the node considered.
*/
double ZLoglikelihood_node_DBN (int node, double **pData)
{
  /* work arrays, allocated on the first call and reused afterwards */
  static double **pdM1 = NULL; /* X' * X, overwritten by its inverse */
  static double *pdV1  = NULL; /* Y' * X */
  static double *pdV2  = NULL; /* (Y' * X) * inv(X' * X) */
  static int nDataM1;          /* nData - 1, set at first call */

  int t, row, col;
  int nPar  = nParents[node];  /* number of parents of node */
  int nCoef = 1 + nPar;        /* constant term + one column per parent */
  double *pdY = pData[node];   /* data of the node considered (response) */
  double *pdXr, *pdXc;         /* data of the parents (regressors) */
  double mx, dQuad;

  if (pdM1 == NULL) {
    // first call: allocate the (stupidly large) work arrays
    int nLarge, nSmall;

    if (nNodes + 1 > nData) {
      nLarge = nNodes + 1;
      nSmall = nData;
    }
    else {
      nLarge = nData;
      nSmall = nNodes + 1;
    }

    pdM1 = InitdMatrix(nLarge, nLarge);
    pdV1 = InitdVector(nSmall);
    pdV2 = InitdVector(nSmall);

    // for DBNs the child series starts at index 1: one less data point
    nDataM1 = nData - 1;
  }

  // Y' * Y, child values taken from index 1 on (DBN shift)
  mx = 0;
  for (t = 1; t < nData; t++) {
    mx += pow(pdY[t], 2);
  }

  // Y' * X: the first column of X is all ones, so the first element
  // is simply the sum of the (shifted) child values
  pdV1[0] = pdY[1];
  for (t = 2; t < nData; t++) {
    pdV1[0] += pdY[t];
  }

  // Y' * X, remaining elements: child at index t against parent at t - 1
  for (col = 1; col <= nPar; col++) {
    pdXc = pData[index_parents[node][col-1]];
    pdV1[col] = pdY[1] * pdXc[0];
    for (t = 2; t < nData; t++) {
      pdV1[col] += pdY[t] * pdXc[t-1];
    }
  }

  // X' * X, corner element: sum of nData - 1 ones
  pdM1[0][0] = nDataM1;

  // X' * X, first line and first column: sums of the parents values
  for (col = 1; col <= nPar; col++) {
    pdXc = pData[index_parents[node][col-1]];
    pdM1[0][col] = 0;
    for (t = 0; t < nDataM1; t++) {
      pdM1[0][col] += pdXc[t];
    }
    pdM1[col][0] = pdM1[0][col];
  }

  // X' * X, parent by parent cross-products: compute the upper triangle
  // and mirror it in the lower one
  for (row = 1; row <= nPar; row++) {
    pdXr = pData[index_parents[node][row-1]];
    for (col = row; col <= nPar; col++) {
      pdXc = pData[index_parents[node][col-1]];
      pdM1[row][col] = 0;
      for (t = 0; t < nDataM1; t++) {
        pdM1[row][col] += pdXr[t] * pdXc[t];
      }
      pdM1[col][row] = pdM1[row][col];
    }
  }

  // invert X' * X in place; with no parent it is a simple scalar
  if (nPar != 0) {
    InvertMatrix(pdM1, nCoef);
  }
  else {
    pdM1[0][0] = 1 / pdM1[0][0];
  }

  // (Y' * X) * inv(X' * X), that is pdV1 * pdM1
  for (col = 0; col < nCoef; col++) {
    pdV2[col] = pdV1[0] * pdM1[0][col];
    for (row = 1; row < nCoef; row++) {
      pdV2[col] += pdV1[row] * pdM1[col][row];
    }
  }

  // (Y' * X) * inv(X' * X) * (X' * Y), that is pdV2 * t(pdV1), a scalar
  dQuad = pdV2[0] * pdV1[0];
  for (row = 1; row < nCoef; row++) {
    dQuad += pdV2[row] * pdV1[row];
  }

  // mx = Y' * Y - g_z / (g_z + 1) * (the scalar just computed)
  mx -= gamma_zellner / (gamma_zellner + 1) * dQuad;

  // log-likelihood; specific to DBN: nData - 1 data points are used
  return(-0.5 * (nCoef * log(gamma_zellner + 1) + nDataM1 * log(mx)));

} /* ZLoglikelihood_node_DBN */


/* end */
