/*
  Bootstrap the given data while avoiding chopping any sites
*/


#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include "my_types.h"
#include "data_interface.h"
#include "misc_functions.h"
#include "pwms.h"
#include "sitePrtctdBtstrp.h"
#include "markov.h"
#include "motif_scan.h"
#include "rand.h"




// Module-level bootstrap settings; bootProtectsSites and nominalBlockSize are set through set_bootParms()
static Boolean bootProtectsSites=FALSE; // If TRUE bootstrap will avoid chopping sites
static double percSitesProtect; // compared against FLOAT_RAND in gen_ranBootBlock: chance that a picked block uses its site-protecting range; set by gen_bootBlocksDB
static double maxEstBlkSize=1e6; // maximal # of total bps generated when estimating the blocks' site density
static double densityAccuracy=0.05/3; // roughly the desired accuracy in estimating the density

static int nominalBlockSize; // nominal (requested) bootstrap block size
extern int scanDirection; // defined elsewhere; passed to get_chunkSites and used to size the sites buffer
extern FILE *outputFile; // defined elsewhere; receives the NOTE printed by gen_bootBlocksDB


void gen_bootBlocksDB(iLetterVec trainData, motif_Struct motif, bootBlocksStruct *bootBlocks)
     // Given a motif and the training data, fill in a bootBlocks structure.
     //
     // One block is created for every start position at which a full nominalBlockSize block fits
     // in trainData.  Each block stores its nominal [start,end] range and a [protectStart,protectEnd]
     // range.  When bootProtectsSites is set, the protected range is widened so that sites scoring
     // at or above the motif's threshold ("GTT" sites) that straddle the block's edge are kept whole,
     // and the global percSitesProtect is set so that the bootstrap site density roughly matches
     // the site density of the training data.
     //
     // If bootProtectsSites is FALSE and bootBlocks->len > 0 the existing blocks are reused as is.
     // The process is aborted if memory cannot be allocated.
     // NOTE(review): when bootProtectsSites is TRUE a previously allocated bootBlocks->entry is
     // overwritten without being freed here -- confirm the caller releases it first.
{
  siteStruct *sites;
  int nScores, i, endBlock, startBlock, siteLeftPos, siteRightPos, nBlocks, densityEstBlkSize;
  double nScoresGTT=0, siteDensity, nGTTdefaultBlk=0, nGTTprotectedBlk=0;
  iLetterVec ranBlock;

  if (!bootProtectsSites && (bootBlocks->len > 0)) // already computed the blocks (motif doesn't matter)
    return;
  bootBlocks->motifID = motif.id;
  bootBlocks->bootProtectsSites = bootProtectsSites;
  bootBlocks->nominalBlockSize = nominalBlockSize;

  nBlocks = MAX2(0, trainData.len-nominalBlockSize+1);
  bootBlocks->len = nBlocks;

  // The allocation is deliberately NOT wrapped in assert(): with NDEBUG the whole expression
  // would vanish.  At least one element is requested so a zero-block table still yields a
  // non-NULL pointer (calloc(0, ...) is allowed to return NULL).
  bootBlocks->entry = calloc(nBlocks > 0 ? nBlocks : 1, sizeof(struct blockEntry));
  if (bootBlocks->entry == NULL) {
    fprintf(stderr, "gen_bootBlocksDB: cannot allocate %d bootstrap blocks\n", nBlocks);
    abort();
  }
  for (i = 0; i < nBlocks; i++) { // default boot blocks
    bootBlocks->entry[i].start = i;
    bootBlocks->entry[i].protectStart = bootBlocks->entry[i].start;
    bootBlocks->entry[i].end =  MIN2(i+nominalBlockSize, trainData.len) - 1;
    bootBlocks->entry[i].protectEnd =  bootBlocks->entry[i].end;
  }

  if (!bootProtectsSites) // no attempt is made to protect sites
    return;

  if (nBlocks == 0) { // training data is shorter than one block: nothing to sample or protect
    percSitesProtect = 0;
    return;
  }

  // Else, first find the density of sites GTT in the training data
  // (size computed in size_t so the product cannot overflow int)
  sites = malloc((size_t) (scanDirection/2+1) * (size_t) trainData.len * sizeof(siteStruct));
  if (sites == NULL) {
    fprintf(stderr, "gen_bootBlocksDB: cannot allocate the sites buffer\n");
    abort();
  }
  nScores = get_chunkSites(trainData, motif, scanDirection, 0, 0, sites, FALSE, 0); // get all scores
  for (i = 0; i < nScores; i++) // count the # of sites above threshold
    nScoresGTT += (sites[i].score >= *(motif.siteThreshold));
  siteDensity = nScoresGTT / trainData.len;

  // Next find each block's extension that will guarantee none of its original GTT sites is chopped
  // The next two blocks of code assume sites are sorted by occurrences
  for (i = 0; i < nScores; i++) // find length required to protect original forward sites
    if ((sites[i].score >= *(motif.siteThreshold)) && !sites[i].rev) { // forward site GTT
      siteLeftPos = sites[i].leftChunkOffset;
      siteRightPos = siteLeftPos + sites[i].motifSpan - 1;
      // every block whose nominal end falls inside the site, and which starts at or before the
      // site's left edge, gets its protected end pushed out to the site's right edge
      for (endBlock = MAX2(nominalBlockSize-1,siteLeftPos); endBlock < siteRightPos; endBlock++) {
        if ((startBlock = endBlock - nominalBlockSize + 1) > siteLeftPos)
          break;
        bootBlocks->entry[startBlock].protectEnd = siteRightPos;
      }
    }
  for (i = nScores-1; i >= 0; i--) // find length and start required to protect original RC sites
    if ((sites[i].score >= *(motif.siteThreshold)) && sites[i].rev) { // RC site GTT
      siteLeftPos = sites[i].leftChunkOffset;
      siteRightPos = siteLeftPos + sites[i].motifSpan - 1;
      // every block that starts inside the site, and whose nominal end reaches the site's right
      // edge, gets its protected start pulled back to the site's left edge
      for (startBlock = MIN2(nBlocks-1,siteRightPos); startBlock > siteLeftPos; startBlock--) {
        if (startBlock + nominalBlockSize - 1 < siteRightPos)
          break;
        bootBlocks->entry[startBlock].protectStart = siteLeftPos;
      }
    }
  free(sites);

  // Finally find and set percSitesProtect which is the probability that a block would be protected
  // The goal is to set it so that site density will remain roughly the same as in the training data
  densityEstBlkSize = MIN2(maxEstBlkSize, (int) pow(1/densityAccuracy,2) * (1-siteDensity) / siteDensity);
  if (densityEstBlkSize < 1) // siteDensity close to (or above) 1 would give a zero/negative length
    densityEstBlkSize = 1;

  percSitesProtect = 0; // global used in gen_ranBootBlock: no block is protected
  ranBlock = gen_ranBootBlock(trainData, *bootBlocks, densityEstBlkSize);
  nGTTdefaultBlk = get_chunkSites(ranBlock, motif, scanDirection, 0, 0, (siteStruct *) NULL, TRUE, *(motif.siteThreshold));
  FREE_ATOMS_VEC(ranBlock);

  percSitesProtect = 1; // global used in gen_ranBootBlock: all blocks are protected
  ranBlock = gen_ranBootBlock(trainData, *bootBlocks, densityEstBlkSize);
  nGTTprotectedBlk = get_chunkSites(ranBlock, motif, scanDirection, 0, 0, (siteStruct *) NULL, TRUE, *(motif.siteThreshold));
  FREE_ATOMS_VEC(ranBlock);

  if (siteDensity <= nGTTdefaultBlk / densityEstBlkSize) { // can't get any lower than not protecting sites
    percSitesProtect = 0;
    fprintf(outputFile, "NOTE: site density (%g) is lower than the unprotected bootstrap site density (%g)\n",
            siteDensity, nGTTdefaultBlk / densityEstBlkSize);
  }
  else if (siteDensity >= nGTTprotectedBlk / densityEstBlkSize) // can't get any higher than protecting all sites
    percSitesProtect = 1;
  else // interpolate linearly between the unprotected and the fully protected site counts
    percSitesProtect = (siteDensity*densityEstBlkSize - nGTTdefaultBlk) / (nGTTprotectedBlk - nGTTdefaultBlk);
}


iLetterVec gen_ranBootBlock(iLetterVec trainData, bootBlocksStruct bootBlocks, int len)
     // Return a random bootstrap sequence of len letters, built by concatenating randomly
     // picked blocks of trainData (the last block is truncated once len letters are reached).
     // Each picked block is copied using its site-protecting range when FLOAT_RAND <= percSitesProtect
     // and using its nominal range otherwise.
     // The vector comes from alloc_iLetterVec(); gen_bootBlocksDB releases it with FREE_ATOMS_VEC.
     // Aborts if letters are requested while bootBlocks holds no blocks.
{
  iLetterVec ranBlock;
  int iBlock, i, blockLen=0, dbBlockStart, dbBlockEnd;

  if ((len > 0) && (bootBlocks.len <= 0)) { // would otherwise compute INT_RAND % 0 below
    fprintf(stderr, "gen_ranBootBlock: no bootstrap blocks to sample from\n");
    abort();
  }

  ranBlock = alloc_iLetterVec(len);
  while (blockLen < len) {
    iBlock = INT_RAND % bootBlocks.len; // NOTE(review): assumes INT_RAND is non-negative -- confirm
    if (FLOAT_RAND <= percSitesProtect) { // choose block size that protects sites
      dbBlockStart = bootBlocks.entry[iBlock].protectStart;
      dbBlockEnd = bootBlocks.entry[iBlock].protectEnd;
    }
    else { // choose nominal block size
      dbBlockStart = bootBlocks.entry[iBlock].start;
      dbBlockEnd = bootBlocks.entry[iBlock].end;
    }
    // append the block, stopping early if the requested length is reached
    for (i = dbBlockStart; (i <= dbBlockEnd) && (blockLen < len); i++)
      ranBlock.entry[blockLen++] = trainData.entry[i];
  }
  return ranBlock;
}


// ----------- Setting user input -----------------


void set_bootParms(Boolean bootProtectsSitesValue, int nominalBlockSizeValue)
     // Store the user's bootstrap settings in this module's file-scope variables:
     // the nominal block size and whether bootstrap should avoid chopping sites
{
  nominalBlockSize = nominalBlockSizeValue;
  bootProtectsSites = bootProtectsSitesValue;
}


// ---------------  memory management ------------------


void free_bootBlocks(bootBlocksStruct *bootBlocks)
     // Free the block table of an allocated bootBlocks structure and mark it empty.
     // Safe to call with a NULL pointer and safe to call more than once on the same structure.
{
  if (bootBlocks == NULL)
    return;
  free(bootBlocks->entry); // free(NULL) is a no-op, so no guard is needed
  bootBlocks->entry = NULL; // do not leave a dangling pointer behind (prevents a double free)
  bootBlocks->len = 0;
}
