#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <float.h>
#include <assert.h>
#include <time.h>
#include "my_types.h"
#include "misc_functions.h"
#include "data_interface.h"
#include "pwms.h"
#include "markov.h"
#include "sitePrtctdBtstrp.h"
#include "mc_sets.h"
#include "motif_scan.h"
#include "tests.h"
#include "diff_tests.h"
#include "rand.h"


// The PWMs under test: filled by parse_2S_command() from the -w file and iterated over in main().
static PWMsVec pwms;

// Passed as the second argument to every sequence-file reader in this file (getChunkDS, readFile2iLetterVec).
// NOTE(review): presumably the alphabet of letters retained from the input - confirm in data_interface.
static char *strip = "acgt";
// The two analyzed input sets (-i1 and -i2); allocated in parse_2S_command().
static analyzedSetType *set1, *set2;
// Null training data. Starts empty and is loaded by parse_2S_command() only when it is needed.
// Defined here without 'static' so that other modules can reach it through an extern declaration.
iLetterVec nullTrainData = {0, NULL};

// Report stream, opened for writing by parse_2S_command() from the -o parameter. Non-static: visible to other modules.
FILE *outputFile;
// Seed handed to RAND_SEED(): the -randSeed value if given, otherwise time(0). Reported at the end of main().
static time_t randSeed;

// Parses the command line and configures the whole run (globals above plus the setters of the other modules).
static void parse_2S_command(int argc, char **argv);
// Builds the random data set creator of one input set from the training source named by MCtrainFile.
static RandomDsCreatorType * setRandTrainFile(char *MCtrainFile, char *nullTrainFileName, char *setFileName, char *otherSetFileName);
// Index of parm in commandLine, or -1 if absent (more than one appearance is an error).
static int findOptionalParameter(char *parm, stringVec commandLine);
// Index of parm in commandLine (anything other than exactly one appearance is an error).
static int findRequiredParameter(char *parm, stringVec commandLine);
// Prints the count of sites at or above the threshold for one set, to outputFile and to stdout.
static void printGTTinfo(dsSitesStruct setSites, char *setName);



/*
  Program entry point.
    1. Echoes the command line to stdout.
    2. Initializes the test matrix and parses the command line (this opens outputFile,
       loads the PWMs and both input sets and configures the requested tests).
    3. Echoes the command line to outputFile.
    4. For each PWM: sets its site threshold, runs the tests on the two sets and prints
       the GTT summary (if requested), the sites of both sets and the test results.
    5. Reports the running time (stdout and outputFile), the random seed and the
       compilation time stamp (outputFile only).
  Returns 0.
*/
int main(int argc, char **argv) 
{
  testStruct *tests;
  int ipwm, i;
  int elapsedSeconds;
  time_t startTime;

  startTime = time(0);
  for (i = 0; i < argc; i++)
    printf("%s ", argv[i]);
  printf("\n");

  initialize_test_matrix();

  parse_2S_command(argc, argv); // outputFile is only valid after this call

  for (i = 0; i < argc; i++)
    fprintf(outputFile, "%s ", argv[i]);
  fprintf(outputFile, "\n");

  for (ipwm = 0; ipwm < pwms.len; ipwm++) {
    fprintf(outputFile, "\n---------------------------------\n%s\n", pwms.entry[ipwm].header);
    set_siteThreshold(pwms.entry+ipwm);
    tests = run_tests(set1, set2, pwms.entry[ipwm]);
    if (GTTscoresRqstd()) {
      printGTTinfo(set1->dsSites, "first set");
      printGTTinfo(set2->dsSites, "second set");
      fprintf(outputFile, "\n");
    }
    printSites(&(set1->data), &(set1->dsSites), &(pwms.entry[ipwm]), "first set", outputFile);
    printSites(&(set2->data), &(set2->dsSites), &(pwms.entry[ipwm]), "second set", outputFile);
    printTests(tests, pwms.entry[ipwm].header);
  }

  // Measure once so that the two reports always agree
  elapsedSeconds = (int)(time(0)-startTime);
  printf("\n\nThis took %d seconds to run.\n", elapsedSeconds);
  // time_t is not guaranteed to be an int: convert explicitly to match the format specifier
  fprintf(outputFile, "\n\nThis took %d seconds to run, random seed = %ld\n", elapsedSeconds, (long)randSeed);
  fprintf(outputFile, "Compile date and time: %s %s\n",__DATE__,__TIME__);
  return 0;
}


void parse_2S_command(int argc, char **argv)
     /*
       Command line format is as follows:
     */
{
  char *synopsis =  // add control of PWM pseudocount
    "-w FILE_NAME (pwms FILE)  "        
    "[-pwmPC  #(PWM pseudocount, default=0.1)]  " 
    "-i1 FILE_NAME (set 1)  "           
    "-i2 FILE_NAME (set 2)  "           
    "-t FILE_NAME (null training file)  "    
    "-o FILE_NAME (output file)  "      
    "[-m  #(null Markov order, default=2)]  " 
    "[-totalPseusoCount  #(default=1.0, effective transition PC for Markov training = totalPC*letterFrequency)]  " 
    "{ [-siteThresholdFile <FILE_NAME>]  ||  [-siteThresholdLearnedFrom #(percentage in [0,1]) {nullMarkovBlock #(block size) | nullTrainFile} ]  }  "
    "[-v #(accepted overlap threshold, default=1)]  "
    "[-scanDirecion forward | reverse | both (default)]  "
    "[-permuteInputSequences first | second | both]  "
    "[-permutePWM]  "                     
    "-tests   "                        
    "qualityScoresGTT  [MannWhitney t-test [setWideMC perSeqMC | MC]] +  "
    "freqScoresGTT [ Hypergeometric *MC ] +  "    
    "topK #K [ MannWhitney t-test *MC ] +  "      
    "topPercent #% [ MannWhitney t-test *MC ] +  "
    "HandT [ uniformSum gammaLogProd ] +  "       
    "HandMW [ uniformSum gammaLogProd ] +  "      
    "setWideMCqualityAndFreq [ uniformSum gammaLogProd ]  --  "
    "('+' between features '--' after last one)  "
    "[-MCstatScope setWide (default) | perSeq | setAndPerSeq]  " 
    "[-MWsigEval exact | normal | NULL (=default: the program decides)]  "
    "[-numRandomSets # (default 1)]  "
    "[-set1RandTrainFile _SELF_ | _BOTH_ | _NULL_FILE_ | FILE_NAME]  "
    "[-set2RandTrainFile _SELF_ | _BOTH_ | _NULL_FILE_ | FILE_NAME]  "
    "[-MCmodel { bootstrap #(nominal chunk size) [protectSites] } | { markov #(order) [#(pseudo count)]} | wholeSeqResampRep | wholeSeqResampNoRep ]  "
    "[-randSeed #]  "
    "[-printTopKperSeq #K ]  "
    "[-printScoresGTT ]  ";

  int parmIndex, auxParmInd, iparm, i, nParms, nextTest;
  stringVec commandLine;
  double sitePercentThresh, siteThreshTrainSize, pwmPC;
  double overlapThreshold, pseudoCount;
  char *parm;
  char *scanDirsNames[] = {"forward", "reverse", "both"};
  stringVec scanDirs = {3, scanDirsNames};
  char *permuteSetsNames[] = {"first", "second", "both"};
  stringVec permuteSets = {3, permuteSetsNames};
  Boolean *parsed; // mark each string you parsed so that unused strings can be reported
  char *set1FileName, *set2FileName, *trainDataFileName, *set1RandTrainFileName, *set2RandTrainFileName;
  char *siteThresholdBlockTypeNames[] = {"nullMarkovBlock", "nullTrainFile"}; // DO NOT CHANGE ORDER !
  stringVec siteThresholdBlockTypeName = {2, siteThresholdBlockTypeNames};
  int siteThresholdBlockType;
  iLetterVec trainData;
  int nullMarkovOrder;

  commandLine.len = argc-1;
  commandLine.entry = argv+1;
  assert( parsed = calloc(commandLine.len, sizeof(Boolean)) );


  if ( commandLine.len == 0 || (parmIndex = findOptionalParameter("-h", commandLine)) > -1 ) {  // print synopsis and exit
    printf("\n%s\n", synopsis);
    exit(0);
  }



  if ( (parmIndex = findOptionalParameter("-randSeed", commandLine)) > -1 ) {  // rand seed has to be set before any randomly generated stuff! Keep on top just in case
    randSeed = my_atol(commandLine.entry[parmIndex+1]);
    setTRUE(parsed, parmIndex, 2);
  }
  else
    randSeed = time(0);
  RAND_SEED(randSeed);


  parmIndex = findRequiredParameter("-o", commandLine);  // output file
  assert( outputFile = fopen(commandLine.entry[parmIndex+1], "w") );
  setTRUE(parsed, parmIndex, 2);


  if ( (parmIndex = findOptionalParameter("-pwmPC", commandLine)) > -1 ) { // PWM pseudocount
    pwmPC = my_atod(commandLine.entry[parmIndex+1]);
    checkRangeD(pwmPC, 0, 1, "-pwmPC #");
    setTRUE(parsed, parmIndex, 2);
  }
  else //default
    pwmPC = 0.1;
  set_pwmPC(pwmPC);


  parmIndex = findRequiredParameter("-w", commandLine);  // weight matrices
  pwms = read_pwms(commandLine.entry[parmIndex+1]);
  setTRUE(parsed, parmIndex, 2);


  parmIndex = findRequiredParameter("-i1", commandLine); // set 1
  set1FileName = commandLine.entry[parmIndex+1];
  set1 = alloc_analyzedSetType(getChunkDS(set1FileName, strip, outputFile));
  setTRUE(parsed, parmIndex, 2);

  parmIndex = findRequiredParameter("-i2", commandLine); // set 2
  set2FileName = commandLine.entry[parmIndex+1];
  set2 = alloc_analyzedSetType(getChunkDS(set2FileName, strip, outputFile));
  setTRUE(parsed, parmIndex, 2);


  parmIndex = findRequiredParameter("-t", commandLine);  // null training set
  trainDataFileName = commandLine.entry[parmIndex+1];
  trainData = readFile2iLetterVec(trainDataFileName, strip, outputFile);
  setTRUE(parsed, parmIndex, 2);
  if ( (auxParmInd = findOptionalParameter("-m", commandLine)) == -1 ) // Markov order
    nullMarkovOrder = 2; // default order
  else {
    nullMarkovOrder = my_atol(commandLine.entry[auxParmInd+1]);
    checkRangeI(nullMarkovOrder, 0, INT_MAX, "-m nullMarkovOrder");
    setTRUE(parsed, auxParmInd, 2);
  }
  if ( (auxParmInd = findOptionalParameter("-totalPseusoCount", commandLine)) == -1 ) // pseudocount
    pseudoCount = 1.0; // default PC
  else {
    pseudoCount = my_atod(commandLine.entry[auxParmInd+1]);
    checkRangeD(pseudoCount, 0, DBL_MAX, "-totalPseusoCount");
    setTRUE(parsed, auxParmInd, 2);
  }
  setNullTrainSeq(trainData, nullMarkovOrder, pseudoCount);
  FREE_ATOMS_VEC(trainData); // once the Markov model was learned the train data can be freed


  if ( (parmIndex = findOptionalParameter("-siteThresholdFile", commandLine)) > -1 ) { // thresholds
    set_siteThresholdsFromFile(commandLine.entry[parmIndex+1], pwms); // read from file and set
    setTRUE(parsed, parmIndex, 2);
    if ( findOptionalParameter("-siteThresholdLearnedFrom", commandLine) > -1 )
      errorMessage("You have to choose between loading thresholds (-siteThresholdFile) and learning them (-siteThresholdLearnedFrom)");
  }
  else { // the threshold will be computed on the fly
    if ( (parmIndex = findOptionalParameter("-siteThresholdLearnedFrom", commandLine)) > -1 ) { // explicit parameters
      sitePercentThresh = my_atod(commandLine.entry[parmIndex+1]);
      checkRangeD(sitePercentThresh, 0, 1, "-siteThresholdLearnedFrom: percentage");
      if ( (siteThresholdBlockType = findStringOnce(commandLine.entry[parmIndex+2],  siteThresholdBlockTypeName)) == 0 ) { // nullMarkovBlock
	nParms = 4;
	siteThreshTrainSize = my_atol(commandLine.entry[parmIndex+3]);
	checkRangeI(siteThreshTrainSize, 1, INT_MAX, "-siteThresholdLearnedFrom:nullMarkovBlock size ");
      } // nullTrainFile
      else {
	nParms = 3;
	if (nullTrainData.len == 0) // store it if it hasn't already been done
	  nullTrainData = readFile2iLetterVec(trainDataFileName, strip, NULL);
      }
    }
    else { // default threshold generation parameters
      nParms = 0;
      siteThresholdBlockType = 0; // Markov
      sitePercentThresh = 0.01;
      siteThreshTrainSize = 1000;
    }
    setTRUE(parsed, parmIndex, nParms);
    set_threshGenerationData(sitePercentThresh, siteThreshTrainSize, siteThresholdBlockType);
  }


  if ( (parmIndex = findOptionalParameter("-v", commandLine)) > -1 ) { // overlap
    overlapThreshold = my_atod(commandLine.entry[parmIndex+1]);
    checkRangeD(overlapThreshold, 0, 1, "-v overlapThreshold");
    setTRUE(parsed, parmIndex, 2);
  }
  else //default
    overlapThreshold = 1;
  set_overlapThreshold(overlapThreshold);


  if ( (parmIndex = findOptionalParameter("-scanDirecion", commandLine)) > -1 ) {   // scan direction
    parm = commandLine.entry[parmIndex+1];
    setTRUE(parsed, parmIndex, 2);
  }
  else // default
    parm = "both";
  iparm = findStringOnce(parm, scanDirs);
  set_scanDirection(iparm);


  if ( (parmIndex = findOptionalParameter("-permuteInputSequences", commandLine)) > -1 ) {  // permute input sets
    iparm = findStringOnce(commandLine.entry[parmIndex+1], permuteSets);
    setTRUE(parsed, parmIndex, 2);
    if (iparm == 0 || iparm == 2)
      permuteAllSeqs(set1->data);
    if (iparm == 1 || iparm == 2)
      permuteAllSeqs(set2->data);
  }

  if ( (parmIndex = findOptionalParameter("-permutePWM", commandLine)) > -1 ) {  // permute the PWMs
    for (i = 0; i < pwms.len; i++)
      permute_wm(pwms.entry+i);
    setTRUE(parsed, parmIndex, 1);
  }


  if ( (parmIndex = findOptionalParameter("-MCstatScope", commandLine)) > -1 ) {  // MC stats: per seq or set wide
    parm = commandLine.entry[parmIndex+1];
    setTRUE(parsed, parmIndex, 2);
  }
  else // default
    parm = "setWide";
  set_MC_scope(parm);


  parmIndex = findRequiredParameter("-tests", commandLine); // The tests
  auxParmInd = findRequiredParameter("--", commandLine); // bookend
  nextTest = parmIndex + 1;
  while (nextTest < auxParmInd) {
    for (i = nextTest; (i < auxParmInd) && (strcmp(commandLine.entry[i], "+") != 0); i++);
    set_feature_test(commandLine.entry+nextTest, i - nextTest);
    nextTest = i + 1;
  }
  setTRUE(parsed, parmIndex, auxParmInd-parmIndex+1);


  if ( (parmIndex = findOptionalParameter("-MWsigEval", commandLine)) > -1 ) {  // MW evaluation method
    parm = commandLine.entry[parmIndex+1];
    setTRUE(parsed, parmIndex, 2);
  }
  else // default
    parm = "NULL";
  set_MWsigEval(parm);


  if ( (parmIndex = findOptionalParameter("-numRandomSets", commandLine)) > -1 ) {  // number of MC random sets
    set_numRandomSets(commandLine.entry[parmIndex+1]);
    setTRUE(parsed, parmIndex, 2);
  }
  else if (MCtestsRqstd())
    errorMessage("-numRandomSets was not specified but MC was chosen");


  if ( (parmIndex = findOptionalParameter("-MCmodel", commandLine)) > -1 ) {  // How is MC data generated?
    for (i = parmIndex + 2; (i < commandLine.len) && (commandLine.entry[i][0] != '-'); i++); // find where this command ends
    if (set_mcModel(commandLine.entry[parmIndex+1], commandLine.entry+parmIndex+2, i - parmIndex - 2) == 0 )
      // Bootstrap
      if ( (findOptionalParameter("-siteThresholdFile", commandLine) > -1) && (nullTrainData.len == 0) )
	nullTrainData = readFile2iLetterVec(trainDataFileName, strip, NULL); // bootstrap requires this
    setTRUE(parsed, parmIndex, i - parmIndex);
  }
  else if (MCtestsRqstd())
    errorMessage("-MCmodel was not specified but MC was chosen");


  if ( (parmIndex = findOptionalParameter("-set1RandTrainFile", commandLine)) > -1 ) {  // set1 image training file
    if (!blockMCmodel())
      errorMessage("-MCmodel is not compatible with setting set1RandTrainFile");
    setTRUE(parsed, parmIndex, 2);
    set1RandTrainFileName = commandLine.entry[parmIndex+1];
    set1->randomGenerator = setRandTrainFile(set1RandTrainFileName, trainDataFileName, set1FileName, set2FileName);
  }
  else if (MCtestsRqstd() && blockMCmodel())
    errorMessage("-set1RandTrainFile was not specified but block MC was chosen");


  if ( (parmIndex = findOptionalParameter("-set2RandTrainFile", commandLine)) > -1 ) {  // set2 image training file
    if (!blockMCmodel())
      errorMessage("-MCmodel is not compatible with setting set1RandTrainFile");
    setTRUE(parsed, parmIndex, 2);
    set2RandTrainFileName = commandLine.entry[parmIndex+1];
    if (strcmp(set1RandTrainFileName, set2RandTrainFileName) == 0 && (strcmp(set1RandTrainFileName, "_SELF_") != 0))
      set2->randomGenerator = set1->randomGenerator; // better save on space and time: the generator file is shared
    else
      set2->randomGenerator = setRandTrainFile(set2RandTrainFileName, trainDataFileName, set2FileName, set1FileName);
  }
  else if (MCtestsRqstd() && blockMCmodel())
    errorMessage("-set2RandTrainFile was not specified but block MC was chosen");


  if ( (parmIndex = findOptionalParameter("-printTopKperSeq", commandLine)) > -1 ) {  // whether Top K good sites should be printed
    set_printTopK(TRUE);
    set_printSitesTopK(commandLine.entry[parmIndex+1]);
    setTRUE(parsed, parmIndex, 2);
  }


  if ( (parmIndex = findOptionalParameter("-printScoresGTT", commandLine)) > -1 ) {  // whether Top K good sites should be printed
    set_printGTT(TRUE);
    setTRUE(parsed, parmIndex, 1);
  }


  for (i = 0; i < commandLine.len; i++) // report any unparsed strings in the command line
    if (!parsed[i])
      errorMessage(allocAndStrCat("I couldn't parse ", commandLine.entry[i]));

  free(parsed);

}


int findRequiredParameter(char *parm, stringVec commandLine)
     /*
       Looks up a mandatory parameter.
       Returns the position of parm in commandLine when it appears exactly once.
       Any other number of appearances (none, or more than one) is reported through ERROR.
       The vector of matches returned by findStringInStringVec is released before returning.
     */
{
  intVec matches = findStringInStringVec(parm, commandLine);
  int position;

  if (matches.len != 1)
    ERROR(("Parameter %s appears %d times", parm, matches.len))
  else {
    position = matches.entry[0]; // the single match
    free(matches.entry);
    return position;
  }
}


int findOptionalParameter(char *parm, stringVec commandLine)
     /*
       Looks up a parameter that may be omitted.
       Returns -1 when parm does not appear in commandLine, or its position when it appears exactly once.
       More than one appearance is reported through ERROR.
       The vector of matches is released only on the single-match path.
     */
{
  intVec matches = findStringInStringVec(parm, commandLine);
  int position;

  if (matches.len == 0) // absent: legitimate for an optional parameter
    return -1;

  if (matches.len > 1)
    ERROR(("Parameter %s appears %d times", parm, matches.len))
  else {
    position = matches.entry[0]; // the single match
    free(matches.entry);
    return position;
  }
}


RandomDsCreatorType * setRandTrainFile(char *MCtrainFile, char *nullTrainFileName, char *setFileName, char *otherSetFileName)
     /*
       Creates the random data set generator of one input set.
       MCtrainFile selects the data the generator is built from:
         _BOTH_      - this set's file followed by the other set's file
         _SELF_      - this set's own file (setFileName)
         _NULL_FILE_ - the null training file (nullTrainFileName)
         otherwise   - MCtrainFile itself is taken to be the name of a file to read
       Only the last case passes outputFile to the reader; the keyword cases pass NULL.
       Returns the result of alloc_RandomDsCreator() on the data that was read.
     */
{
  iLetterVec source, ownData, otherData;
  int k;

  if (strcmp(MCtrainFile, "_BOTH_") == 0) {
    // Concatenate: own set first, the other set right after it
    ownData = readFile2iLetterVec(setFileName, strip, NULL);
    otherData = readFile2iLetterVec(otherSetFileName, strip, NULL);
    source = alloc_iLetterVec(ownData.len + otherData.len);
    for (k = 0; k < ownData.len; k++)
      source.entry[k] = ownData.entry[k];
    for (k = 0; k < otherData.len; k++)
      source.entry[ownData.len + k] = otherData.entry[k];
    FREE_ATOMS_VEC(ownData);
    FREE_ATOMS_VEC(otherData);
  }
  else if (strcmp(MCtrainFile, "_SELF_") == 0) {
    source = readFile2iLetterVec(setFileName, strip, NULL);
  }
  else if (strcmp(MCtrainFile, "_NULL_FILE_") == 0) {
    source = readFile2iLetterVec(nullTrainFileName, strip, NULL);
  }
  else {
    // Not a keyword: a file that has not been read elsewhere in this module
    source = readFile2iLetterVec(MCtrainFile, strip, outputFile);
  }

  return alloc_RandomDsCreator(source);
}


void printGTTinfo(dsSitesStruct setSites, char *setName)
     // Reports, for the set called setName, the number of selected sites at or above the threshold T,
     // the threshold itself and the total number of sites. The same line goes to outputFile and then to stdout.
{
  FILE *sinks[2];
  int s;

  sinks[0] = outputFile;
  sinks[1] = stdout;
  for (s = 0; s < 2; s++)
    fprintf(sinks[s], "\nThe %s has %d sites with score >= %f (out of %d sites)", setName,
            setSites.slctdSites.aboveT.len, setSites.slctdSites.T, setSites.allSites.len);
}
