/*
  Manage a Markov model
*/


#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include "my_types.h"
#include "data_interface.h"
#include "misc_functions.h"
#include "markov.h"


/* Alphabet size. */
#define ALPHAS 4
/*
  Flat index of the (past, present) pair inside one row of a transition table:
  the ALPHAS entries that share the same past are stored contiguously.
  Arguments and the whole expansion are parenthesized so that expressions such as
  PPIND(a+b, c) or x*PPIND(a, b) expand as intended.
*/
#define PPIND(past,present) ((past)*ALPHAS + (present))


// Table of integer powers of ALPHAS (AlphaPows[i] == ALPHAS^i), filled by computeAlphaPows();
// NULL until the first table has been computed.
static unsigned long *AlphaPows=NULL;
// Largest order the current AlphaPows table was computed for (-1 = no table yet).
static int maxOrder = -1;


// File-local helpers (defined further down in this file).
static void computeAlphaPows(int order);
static double **allocTransitionP(int order);
static void countAllTransitions(iLetterVec trainData, int order, double **transitionC);



MarkovModelType trainMarkovModel(iLetterVec trainData, int order, double pseudoCount, int scanMode)
     /*
       Train the Markov model from training data.

       trainData   - integer-coded letters; every entry is used directly as a table index
       order       - order of the model (must be >= 0); all lower orders are trained too
       pseudoCount - total pseudocount added to each context.  For order 0 it is spread
                     uniformly over the letters; for higher orders the effective
                     pseudocount of a letter = pseudoCount * letterFrequency
                     (the order-0 probability of that letter)
       scanMode    - (of train file): 0 = only forward, 2 = both strands

       Returns a model whose transitionP / logTransitionP tables are allocated here
       (freeMarkovModel releases them) and whose order field is set to `order`.

       A context with no observations and no pseudocount would give 0/0; in that case
       the pseudoCount -> 0 limit of the formula is used instead (uniform for order 0,
       the order-0 probabilities for higher orders) so that no NaN enters the tables.
     */
{
  int i, j, pastSize;
  unsigned long ctx;          // context index; same type as the AlphaPows bounds
  MarkovModelType MM;
  double count, denom;        // double: totals may exceed the range of unsigned int
  iLetterVec RCtrainData;

  assert(order >= 0);

  // evaluate the integer powers of ALPHAS (this would be handy later on here and elsewhere)
  if (order > maxOrder) {
    computeAlphaPows(order);
    maxOrder = order;
  }

  // the table first accumulates raw counts and is normalized in place below
  MM.transitionP = allocTransitionP(order);
  countAllTransitions(trainData, order, MM.transitionP);
  if (scanMode == 2) {
    // also count the reverse-complement strand
    RCtrainData = alloc_iLetterVec(trainData.len);
    for (i = 0; i < trainData.len; i++)
      RCtrainData.entry[i] = (ALPHAS-1) - trainData.entry[trainData.len-i-1]; // this only makes sense for DNA
    countAllTransitions(RCtrainData, order, MM.transitionP);
    FREE_ATOMS_VEC(RCtrainData);
  }

  // turn counts to transition probabilities: order 0 first, since the higher orders
  // use the order-0 probabilities to distribute their pseudocount
  count = 0;
  for (j = 0; j < ALPHAS; j++)
    count += MM.transitionP[0][j];
  denom = count + pseudoCount;
  for (j = 0; j < ALPHAS; j++) {
    if (denom != 0.0)
      MM.transitionP[0][j] = (MM.transitionP[0][j] + pseudoCount/ALPHAS) / denom;
    else
      MM.transitionP[0][j] = 1.0 / ALPHAS;
  }

  for (pastSize = 1; pastSize <= order; pastSize++) {
    double *row = MM.transitionP[pastSize];

    for (ctx = 0; ctx < AlphaPows[pastSize]; ctx++) {
      count = 0;
      for (j = 0; j < ALPHAS; j++)
        count += row[PPIND(ctx,j)];
      denom = count + pseudoCount;
      for (j = 0; j < ALPHAS; j++) {
        if (denom != 0.0)
          row[PPIND(ctx,j)] = (row[PPIND(ctx,j)] + pseudoCount*MM.transitionP[0][j]) / denom;
        else
          row[PPIND(ctx,j)] = MM.transitionP[0][j];
      }
    }
  }

  // log-probabilities, same layout as transitionP
  MM.logTransitionP = allocTransitionP(order);
  for (pastSize = 0; pastSize <= order; pastSize++)
    for (ctx = 0; ctx < AlphaPows[pastSize]; ctx++)
      for (j = 0; j < ALPHAS; j++)
        MM.logTransitionP[pastSize][PPIND(ctx,j)] = log(MM.transitionP[pastSize][PPIND(ctx,j)]);

  MM.order = order;
  return MM;
}

/*
 Writes the Markov model represented by MM to the file specified by fname.
 Layout: the order on the first line, then one probability per line, for each
 past size 0..order, each past, each present letter (in that nesting order).
 Returns TRUE only if the file was opened, every value was written and the file
 was closed without error; FALSE otherwise.
 */
Boolean writeMarkovModelToFile(MarkovModelType MM, char* fname)	{
	int j, k;
	unsigned long i, p = 1;	/* p = ALPHAS^k, the number of pasts of size k */
	Boolean ok = TRUE;
	FILE* bgfile = fopen(fname, "w");

	if (bgfile == NULL)
		return FALSE;

	if (fprintf(bgfile, "%d\n", MM.order) < 0)
		ok = FALSE;
	for (k = 0; ok && k <= MM.order; k++)	{
		for (i = 0; ok && i < p; i++)	{
			for (j = 0; j < ALPHAS; j++)	{
				if (fprintf(bgfile, "%0.16lf\n", MM.transitionP[k][PPIND(i,j)]) < 0)	{
					ok = FALSE;
					break;
				}
			}
		}
		p *= ALPHAS;
	}
	/* fclose flushes buffered output, so its failure also means an incomplete file */
	if (fclose(bgfile) != 0)
		ok = FALSE;
	return ok;
}


/*
 Reads a Markov model from the file specified by fname: the order first, then the
 transition probabilities for each past size 0..order, each past, each present letter.
 logTransitionP is computed from the probabilities that are read.
 On failure (file cannot be opened, order missing or negative, probabilities missing
 or malformed) a message is printed to stderr and the returned model has
 order = -1 and transitionP = logTransitionP = NULL.
 */
MarkovModelType readMarkovModelFromFile(char* fname)	{
	int order, j, pastSize;
	unsigned long i;
	int ok = 1;
	MarkovModelType MM;
	FILE* bgfile;

	/* "empty model" returned on every failure path */
	MM.order = -1;
	MM.transitionP = NULL;
	MM.logTransitionP = NULL;

	bgfile = fopen(fname, "r");
	if (bgfile == NULL)	{
		perror(fname);
		return MM;
	}
	if (fscanf(bgfile, "%d", &order) != 1 || order < 0)	{
		fprintf(stderr, "%s: missing or invalid Markov model order\n", fname);
		fclose(bgfile);
		return MM;
	}
	if (order > maxOrder)	{
		computeAlphaPows(order);
		maxOrder = order;
	}

	/* each table is allocated exactly once */
	MM.order = order;
	MM.transitionP = allocTransitionP(order);
	MM.logTransitionP = allocTransitionP(order);

	for (pastSize = 0; ok && pastSize <= order; pastSize++)
		for (i = 0; ok && i < AlphaPows[pastSize]; i++)
			for (j = 0; j < ALPHAS; j++)	{
				if (fscanf(bgfile, "%lf", &MM.transitionP[pastSize][PPIND(i,j)]) != 1)	{
					ok = 0;
					break;
				}
				MM.logTransitionP[pastSize][PPIND(i,j)] = log(MM.transitionP[pastSize][PPIND(i,j)]);
			}

	fclose(bgfile);

	if (!ok)	{
		/* truncated or malformed file: release the partially filled tables */
		fprintf(stderr, "%s: missing or invalid transition probability\n", fname);
		for (pastSize = 0; pastSize <= order; pastSize++)	{
			free(MM.transitionP[pastSize]);
			free(MM.logTransitionP[pastSize]);
		}
		free(MM.transitionP);
		free(MM.logTransitionP);
		MM.transitionP = NULL;
		MM.logTransitionP = NULL;
		MM.order = -1;
	}

	return MM;
}


static double **allocTransitionP(int order)
     // Allocate the transition probability matrix (including lower order ones):
     // one zero-initialized row per past size i = 0..order, with AlphaPows[i]*ALPHAS entries.
     // AlphaPows must already cover `order`.  Aborts the program if memory runs out.
{
  double **transitionP;
  int i;

  // the allocations are kept outside assert() so they still happen when NDEBUG is defined
  transitionP = calloc(order+1, sizeof *transitionP);
  if (transitionP == NULL) {
    fprintf(stderr, "allocTransitionP: out of memory\n");
    abort();
  }
  for (i = 0; i <= order; i++) {
    transitionP[i] = calloc(AlphaPows[i]*ALPHAS, sizeof **transitionP);
    if (transitionP[i] == NULL) {
      fprintf(stderr, "allocTransitionP: out of memory\n");
      abort();
    }
  }
  return transitionP;
}


static void countAllTransitions(iLetterVec trainData, int order, double **transitionC)
  // Count all transitions of every past size 0..order found in trainData and add them
  // to transitionC (counts accumulate, so the function can be called on several sequences).
  // transitionC[pastSize][PPIND(past,present)] is incremented once for every position
  // whose preceding pastSize letters encode to `past`.
{
  int i, j, past, pastSize, present;

  for (pastSize = 0; pastSize <= order; pastSize++) {
    // a sequence shorter than the context holds no transition of this or any higher
    // order; stop here so the seeding loop below never reads past the end of the data
    if (trainData.len < pastSize)
      break;

    // encode the first pastSize letters as the initial past
    past = 0;
    for (j = 0; j < pastSize; j++)
      past = past * ALPHAS + trainData.entry[j];

    // slide along the sequence: count, then shift the present letter into the past
    for (i = pastSize; i < trainData.len; i++) {
      present = trainData.entry[i];
      transitionC[pastSize][PPIND(past, present)] ++;
      past = (past * ALPHAS + present) % AlphaPows[pastSize];
    }
  }
}


iLetterVec genMarkovBlock(MarkovModelType MM, int len)
     // Generate a realization of the Markov chain of length len.
     // Generation starts with an empty context, so the first letters are drawn from the
     // lower-order tables until MM.order letters of context are available.
     // NOTE(review): multinomialDraw is presumably given the ALPHAS probabilities of the
     // current context and returns the index of the drawn letter - confirm in its definition.
{
  iLetterVec generated = alloc_iLetterVec(len);
  int context = 0;      // encoded preceding letters
  int contextLen = 0;   // how many preceding letters are encoded in context
  int pos = 0;

  while (pos < len) {
    // distribution of the next letter given the current context
    double *dist = &MM.transitionP[contextLen][PPIND(context,0)];
    int letter = multinomialDraw(dist, ALPHAS);

    generated.entry[pos] = letter;

    // grow the context up to MM.order, then keep only its most recent letters
    contextLen = MIN2(contextLen+1, MM.order);
    context = (context * ALPHAS + letter) % AlphaPows[contextLen];
    pos++;
  }
  return generated;
}


double getMarkovLklhd(MarkovModelType markovModel, iLetter *string, int loc, int len)
     /*
       Log-likelihood of a substring under markovModel, conditional on the letters
       flanking it on the left (the sum of the logTransitionP entries of its letters).
       loc,len - location and length of the substring within string
       The string is assumed to have at least loc+len letters.
       When fewer than markovModel.order letters precede loc, the lower-order tables
       are used until enough context has accumulated.
     */
{
  double logLkl = 0;
  int context = 0;
  int contextLen = MIN2(markovModel.order, loc);
  const int end = loc + len;
  int pos;
  iLetter letter;

  // encode the left flank as the initial context
  pos = loc - contextLen;
  while (pos < loc) {
    context = context * ALPHAS + string[pos];
    pos++;
  }

  // score each letter of the substring, then shift it into the context
  pos = loc;
  while (pos < end) {
    letter = string[pos];
    logLkl += markovModel.logTransitionP[contextLen][PPIND(context,letter)];
    contextLen = MIN2(contextLen+1, markovModel.order);
    context = (context * ALPHAS + letter) % AlphaPows[contextLen];
    pos++;
  }
  return logLkl;
}


realVec getPosMarkovLklhds(MarkovModelType markovModel, iLetterVec string)
//       Compute, for each letter of string, its log transition probability under
//       markovModel given the letters before it (lower-order tables are used for the
//       first letters, where less than markovModel.order letters of context exist).
//       Returns a newly allocated vector with one entry per letter.
{
  realVec perLetter = alloc_realVec(string.len);
  int context = 0;      // encoded preceding letters
  int contextLen = 0;   // number of letters encoded in context
  int pos;
  iLetter letter;

  for (pos = 0; pos < string.len; ++pos) {
    letter = string.entry[pos];
    perLetter.entry[pos] = markovModel.logTransitionP[contextLen][PPIND(context,letter)];

    // extend the context up to the model order and drop letters that fall out of it
    contextLen = MIN2(contextLen+1, markovModel.order);
    context = (context * ALPHAS + letter) % AlphaPows[contextLen];
  }
  return perLetter;
}



static void computeAlphaPows(int order)
     // Computes the integer powers of ALPHAS: replaces the AlphaPows table with a new one
     // holding AlphaPows[i] = ALPHAS^i for i = 0..order (order must be >= 0).
     // Aborts the program if memory runs out.
{
  int i;

  assert(order >= 0);

  free(AlphaPows);   // free(NULL) is a no-op, so no guard is needed
  // the allocation is kept outside assert() so it still happens when NDEBUG is defined
  AlphaPows = calloc(order + 1, sizeof *AlphaPows);
  if (AlphaPows == NULL) {
    fprintf(stderr, "computeAlphaPows: out of memory\n");
    abort();
  }
  AlphaPows[0] = 1;
  for (i = 0; i < order; i++)
    AlphaPows[i+1] = AlphaPows[i] * ALPHAS;
}


void freeMarkovModel(MarkovModelType MM)
     // Frees the allocated memory of the model's transitionP and logTransitionP tables
     // (rows 0..MM.order of each, then the tables themselves).
     // A model whose transitionP is NULL is treated as empty and left untouched.
     // MM is passed by value, so the caller's copy still holds the stale pointers afterwards.
{
  int i;

  if (MM.transitionP == NULL)
    return;

  // free(NULL) is a no-op, so rows need no individual NULL checks
  for (i = 0; i <= MM.order; i++)
    free(MM.transitionP[i]);
  free(MM.transitionP);

  // the log table is released independently and may itself be missing
  if (MM.logTransitionP != NULL) {
    for (i = 0; i <= MM.order; i++)
      free(MM.logTransitionP[i]);
    free(MM.logTransitionP);
  }
}
