#define READ_BUFFER_INC 10000	/* Growth step (in characters) of the buffer into which
				   a sequence body is read */
#define N_SEQS_INC 100		/* Growth step (in records) of the array that holds the
				   sequences being read */
#define MAX_HEADER_LENGTH 300	/* Size of every stored header buffer; header lines longer
				   than this are truncated */
#define MAX_STRIP_LENGTH 100	/* Nominal upper bound on the length of a strip (alphabet).
				   NOTE(review): not referenced in the visible part of
				   this file - confirm it is still needed */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "my_types.h"
#include "misc_functions.h"
#include "rand.h"
#include "data_interface.h"


/* Helpers private to this file; their definitions follow below. */
static int move_to_next_seq_start(FILE *fp, cSeqStruct *p_c_seq);
static int read_seq_into_buffer(FILE *fp, cSeqStruct *p_c_seq);
static void print_char_or_space(char c, int i, int lb, int up);
static int char_compare(const char *i, const char *j);
static int retrieveSeqChar(chunksSeq seq, int pos);


/* Counter handed out (and then incremented) by alloc_chunksSeqVec, so that
   every chunksSeqVec allocated through it gets a distinct id. */
static unsigned int DATA_SET_ID=0; // a global data set identifier used to associate other data with the set




cSeqVec read_seqs(char *input_file_name)
{
  FILE *fp;
  cSeqVec c_seqs;
  int n_seqs=0, n_max_seqs=0;

  if ( (fp = fopen(input_file_name, "r")) == NULL )
  {
    printf( "File %s does not exist!!!\n", input_file_name);
    exit(-1);
  }
  
  c_seqs.entry = NULL;
  while (!feof(fp)) {
    if (n_max_seqs <= n_seqs)
      assert( c_seqs.entry = realloc(c_seqs.entry, (n_max_seqs+=N_SEQS_INC)*sizeof(cSeqStruct)) );
    if (move_to_next_seq_start(fp, &(c_seqs.entry[n_seqs])) == 0)
      break;
    if (read_seq_into_buffer(fp, &(c_seqs.entry[n_seqs])) == 0) {
      printf( "Sequence %d has 0 length!!!\n", n_seqs+1);
	  continue;
    }
   // assert( c_seqs.entry[n_seqs].body.entry = realloc(c_seqs.entry[n_seqs].body.entry,
					      //c_seqs.entry[n_seqs].body.len * sizeof(char)) );
     n_seqs++;
  }
  fclose(fp);
  c_seqs.len = n_seqs;
  return c_seqs;
}


    
/*
 * move_to_next_seq_start: skips input up to and including the next '>' and
 * stores the rest of that line (without the newline) as the record header.
 *
 * On success p_c_seq->header is a newly allocated buffer of
 * MAX_HEADER_LENGTH characters that starts with '>', holds at most
 * MAX_HEADER_LENGTH-1 header characters and is padded with spaces (it is
 * NOT NUL-terminated); the number of header characters is stored in
 * p_c_seq->header_len and returned.
 * If no '>' is found before the end of the input (or a read error), 0 is
 * returned and p_c_seq->header is set to NULL.
 */
int move_to_next_seq_start(FILE *fp, cSeqStruct *p_c_seq)
{
  int i, h_len=0;
  int c;	/* int rather than char, so that EOF stays distinguishable */

  p_c_seq->header = NULL;
  do {
    c = getc(fp);
  } while (c != EOF && c != '>');
  if (c == EOF)
    return 0;

  p_c_seq->header = malloc(MAX_HEADER_LENGTH * sizeof(char));
  if (p_c_seq->header == NULL) {
    fprintf(stderr, "move_to_next_seq_start: out of memory\n");
    exit(-1);
  }

  p_c_seq->header[h_len++] = '>';
  /* Testing the value returned by getc (instead of feof before the call)
     keeps the EOF marker itself out of the header. */
  while ((c = getc(fp)) != EOF  &&  c != '\n')
    if (h_len < MAX_HEADER_LENGTH-1)
      p_c_seq->header[h_len++] = (char) c;

  for (i = h_len; i < MAX_HEADER_LENGTH; i++)
    p_c_seq->header[i] = ' ';
  return (p_c_seq->header_len = h_len);
}
    
/*
 * read_seq_into_buffer: reads the body of one record, i.e. everything up to
 * the next '>' or the end of the input, into p_c_seq->body.entry.
 *
 * Any !isspace character is allowed to pass through.  A terminating '>' is
 * pushed back so that the next call to move_to_next_seq_start sees it.
 * Returns the number of characters stored (also written to
 * p_c_seq->body.len); body.entry stays NULL when that number is 0.
 */
int read_seq_into_buffer(FILE *fp, cSeqStruct *p_c_seq)
{
  int capacity=0, seqlen=0;
  int c;	/* int rather than char, so that EOF stays distinguishable */
  void *grown;

  p_c_seq->body.entry = NULL;
  while ((c = getc(fp)) != EOF  &&  c != '>') {
    if (isspace(c))
      continue;
    /* Grow before the write whenever the buffer is full.  (Starting the
       capacity at -1 and testing with '>' let one character be written
       past the end of the allocation.) */
    if (seqlen >= capacity) {
      capacity += READ_BUFFER_INC;
      grown = realloc(p_c_seq->body.entry, capacity * sizeof(char));
      if (grown == NULL) {
        fprintf(stderr, "read_seq_into_buffer: out of memory\n");
        exit(-1);
      }
      p_c_seq->body.entry = grown;
    }
    p_c_seq->body.entry[seqlen++] = (char) c;
  }
  if (c == '>')
    ungetc(c, fp);
  return (p_c_seq->body.len = seqlen);
}


/*
 * filter_sequences: translates every sequence of c_seqs into letter codes,
 * keeping only the characters that appear (case-insensitively) in i_strip.
 *
 * The strip is lower-cased and sorted; a retained character is stored as
 * its index in that sorted strip.  The sorted strip is returned in
 * f_seqs.strip (heap allocated, owned by the caller).
 * If report_gaps is true, gap_len[j] of a sequence holds the number of
 * discarded characters that immediately precede its j-th retained letter
 * (stored as unsigned short), and num_gaps counts the non-empty gaps.
 * Exits with status -1 if memory runs out or if the strip has more than
 * 255 letters (codes are kept in an unsigned char dictionary).
 */
iSeqVec filter_sequences(cSeqVec c_seqs, char i_strip[], int report_gaps)
{
  int seq, i, j, last_i, lstrip, iLet;
  iSeqVec f_seqs;
  iSeqStruct *p_i_seq;
  cSeqStruct *p_c_seq;
  unsigned char dic[256] = {0};	/* character -> code+1; 0 means "discard" */
  char *strip;

  lstrip = (int) strlen(i_strip);
  if (lstrip > 255) {
    fprintf(stderr, "filter_sequences: strip is too long (%d > 255 letters)\n", lstrip);
    exit(-1);
  }
  strip = malloc((lstrip+1) * sizeof(char));
  if (strip == NULL) {
    fprintf(stderr, "filter_sequences: out of memory\n");
    exit(-1);
  }
  /* ctype functions need an unsigned char value (or EOF) */
  for (i = 0; i < lstrip; i++)
    strip[i] = (char) tolower((unsigned char) i_strip[i]);
  strip[lstrip] = '\0';
  qsort(strip, (size_t) lstrip, sizeof (char),
        (int (*)(const void *, const void *)) char_compare );
  for (i = 0; i < lstrip; i++)
    dic[(unsigned char) strip[i]] = (unsigned char) (i + 1);

  f_seqs.len = c_seqs.len;
  f_seqs.strip = strip;
  f_seqs.report_gaps = report_gaps;
  f_seqs.entry = calloc(c_seqs.len > 0 ? c_seqs.len : 1, sizeof(iSeqStruct));
  if (f_seqs.entry == NULL) {
    fprintf(stderr, "filter_sequences: out of memory\n");
    exit(-1);
  }

  for (seq = 0; seq < c_seqs.len; seq++) {
    p_c_seq = c_seqs.entry+seq;
    p_i_seq = f_seqs.entry+seq;

    /* gap bookkeeping: one (zeroed) slot per original character */
    p_i_seq->num_gaps = 0;
    p_i_seq->gap_len = NULL;
    if (report_gaps) {
      p_i_seq->gap_len = calloc(p_c_seq->body.len > 0 ? p_c_seq->body.len : 1,
                                sizeof(unsigned short));
      if (p_i_seq->gap_len == NULL) {
        fprintf(stderr, "filter_sequences: out of memory\n");
        exit(-1);
      }
    }

    /* the header buffer is copied whole, padding included */
    p_i_seq->header = malloc(MAX_HEADER_LENGTH * sizeof(char));
    if (p_i_seq->header == NULL) {
      fprintf(stderr, "filter_sequences: out of memory\n");
      exit(-1);
    }
    memcpy(p_i_seq->header, p_c_seq->header, MAX_HEADER_LENGTH * sizeof(char));
    p_i_seq->header_len = p_c_seq->header_len;

    /* translate the body, recording gaps if so requested */
    p_i_seq->body = alloc_iLetterVec(p_c_seq->body.len);
    last_i = -1;
    for (i = 0, j = 0; i < p_c_seq->body.len; i++) {
      iLet = dic[(unsigned char) tolower((unsigned char) p_c_seq->body.entry[i])];
      if (iLet == 0)
        continue;		/* not in the strip */
      p_i_seq->body.entry[j++] = iLet-1;
      if (report_gaps) {
        if (i-last_i-1) {
          p_i_seq->num_gaps++;
          p_i_seq->gap_len[j - 1] = i-last_i-1;
        }
        last_i = i;
      }
    }
    realloc_iLetterVec(&(p_i_seq->body), j);	/* shrink to the retained letters */
  }
  return f_seqs;
}


/*
 * nucl_frequency: counts the occurrences of each strip letter over all the
 * sequences of seqs.
 *
 * Returns a heap array with one entry per strip letter; the caller must
 * free it.  NOTE: the counts are turned into relative frequencies only when
 * please_print is non-zero (in which case they are also written to
 * outfile); otherwise the raw counts are returned.
 */
double *nucl_frequency(iSeqVec seqs, int please_print, FILE *outfile)
{
  int seq, i, lstrip, total=0;
  double *freqs;

  lstrip = strlen(seqs.strip);
  /* at least one slot, so that an empty strip is not mistaken for OOM */
  freqs = calloc(lstrip > 0 ? lstrip : 1, sizeof(double));
  if (freqs == NULL) {
    fprintf(stderr, "nucl_frequency: out of memory\n");
    exit(-1);
  }

  for (seq = 0; seq < seqs.len; seq++) {
    total += seqs.entry[seq].body.len;
    for (i = 0; i < seqs.entry[seq].body.len; i++)
      freqs[seqs.entry[seq].body.entry[i]]++;
  }

  if (please_print) {
    fprintf(outfile, "\nThere are %d nucl in %d sequences, distributed as follows:\n", total,
             seqs.len);
    for (i = 0; i < lstrip; i++) {
      if (total > 0)		/* avoid 0/0 on an empty data set */
        freqs[i] /= total;
      fprintf(outfile, "%c - %3.2f , ", seqs.strip[i], freqs[i]);
    }
    fprintf(outfile, "\n\n");
  }
  return freqs;
}


void print_a_line(iSeqStruct i_seq, char *strip, int at, int len, int padding, int header)
     /*
       prints the "len" characters of the sequence "i_seq" starting at position "at"
       (translated accroding to "strip") and pads it with padding characters before
       and after - useful for printing alignments.
       Note that no "\n" are used.
       If "header" >0 then the first "header" characters of the header are printed first.
       IMPORTANT: gaps are completely ignored though they really shouldn't be
     */
{
  int i;

  if (header > 0) {
    for (i = 0; i < MIN2(header,MAX_HEADER_LENGTH); i++)
      printf("%c", i_seq.header[i]);
    printf(":%4d)", at+1);
  }
  for (i = at-padding; i < at; i++)
    print_char_or_space(strip[i_seq.body.entry[i]], i, 0, i_seq.body.len-1);
  printf("   ");

  for (i = at; i < at+len; i++)
    print_char_or_space(strip[i_seq.body.entry[i]], i, 0, i_seq.body.len-1);
  printf("   ");

  for (i = at+len; i < at+len+padding; i++)
    print_char_or_space(strip[i_seq.body.entry[i]], i, 0, i_seq.body.len-1);

  return;
}

/*
 * print_char_or_space: writes "c" to stdout when the index "i" lies in the
 * closed range [lb, up]; writes a single space otherwise.
 */
void print_char_or_space(char c, int i, int lb, int up)
{
    int inside = (i >= lb) && (i <= up);

    printf("%c", inside ? c : ' ');
}

/*
 * char_compare: ordering of two characters; the result is negative, zero
 * or positive when *i is smaller than, equal to or greater than *j.
 */
int char_compare(const char *i, const char *j)
{
	int lhs = *i;
	int rhs = *j;

	return lhs - rhs;
}



/*
 * Function chunk_decomp decomposes every sequence of i_seqs into chunks,
 * i.e. maximal runs of retained letters that are not interrupted by a gap.
 *
 * If i_seqs.report_gaps is 0 each sequence becomes a single chunk.
 * Otherwise a new chunk starts at every letter j > 0 with gap_len[j] != 0.
 * startPos of a chunk is its offset from the first retained letter of the
 * sequence, gaps between chunks included; a gap in front of the first
 * letter is not counted, so the first chunk always starts at 0.
 * The header is copied with a terminating NUL so that it can be handled as
 * a C string.  If outfile is not NULL a debug line is written for every
 * chunk of a gapped decomposition.
 * Exits with status -1 if memory runs out.
 */
chunksSeqVec chunk_decomp(iSeqVec i_seqs, FILE * outfile)
{
	int i, j, k, l, n_chunks, chunkStartPos, seqLen, first;
	chunksSeqVec seqs;
	chunkStruct *p_chunks;
	iSeqStruct *src;

	seqs = alloc_chunksSeqVec(i_seqs.len);
	for(i=0;i<seqs.len;i++)
	{
		src = i_seqs.entry + i;
		seqLen = src->body.len;

		seqs.entry[i].header_len = src->header_len;
		seqs.entry[i].header = (char *)malloc((src->header_len + 1)*sizeof(char));
		if (seqs.entry[i].header == NULL) {
			fprintf(stderr, "chunk_decomp: out of memory\n");
			exit(-1);
		}
		memcpy(seqs.entry[i].header, src->header, src->header_len*sizeof(char));
		seqs.entry[i].header[src->header_len] = '\0';
		seqs.entry[i].total_len = seqLen;

		/* Count the chunks from gap_len itself: num_gaps also counts a gap
		   in front of the first letter, which does not split anything. */
		n_chunks = 1;
		if (i_seqs.report_gaps != 0)
			for (l = 1; l < seqLen; l++)
				if (src->gap_len[l] != 0)
					n_chunks++;

		p_chunks = (chunkStruct *)calloc(n_chunks, sizeof(chunkStruct));
		if (p_chunks == NULL) {
			fprintf(stderr, "chunk_decomp: out of memory\n");
			exit(-1);
		}

		l = 0;
		chunkStartPos = 0;
		for (j = 0; j < n_chunks; j++)
		{
			/* [first, l) is the range of letters of chunk j */
			first = l;
			if (i_seqs.report_gaps == 0)
				l = seqLen;
			else {
				if (l < seqLen)
					l++;
				while ((l < seqLen) && (src->gap_len[l] == 0))
					l++;
			}
			k = l - first;

			p_chunks[j].startPos = chunkStartPos;
			p_chunks[j].body = alloc_iLetterVec(k);
			if (k > 0)
				memcpy(p_chunks[j].body.entry, src->body.entry + first, k*sizeof(iLetter));

			chunkStartPos += k;
			if (l < seqLen)
				chunkStartPos += src->gap_len[l];

			if (i_seqs.report_gaps != 0 && outfile != 0)
				fprintf(outfile,"len=%d char%d%d\n", p_chunks[j].body.len,
					(k > 0) ? (int) p_chunks[j].body.entry[0] : 0,
					(k > 1) ? (int) p_chunks[j].body.entry[1] : 0);
		}
		seqs.entry[i].num_chunks = n_chunks;
		seqs.entry[i].chunks = p_chunks;
	}
	seqs.strip = allocAndStrCpy(i_seqs.strip);
	return(seqs);
}


chunksSeqVec getChunkDS(char *fileName, char *strip, FILE *outfile)
     // Returns a chunksSeqVec filtered out of the file
{
  cSeqVec cSV;
  iSeqVec iSV;
  chunksSeqVec ds;

  cSV = read_seqs(fileName);
  iSV = filter_sequences(cSV, strip, TRUE);
  if (outfile != NULL) {
    fprintf(outfile, "\nIn File %s:", fileName);
    nucl_frequency(iSV, 1, outfile);
  }
  ds = chunk_decomp(iSV, NULL);
  free_c_seqs(cSV);
  free_i_seqs(iSV);
  return ds;
}


iLetterVec readFile2iLetterVec(char *fileName, char *strip, FILE *outfile)
     // Reads the input set from fileName and returns it as a iLetterVec
{
  cSeqVec cSV;
  iSeqVec iSet;
  int iSeq, N=0, j;
  iLetterVec iLV;

  cSV = read_seqs(fileName);
  iSet = filter_sequences(cSV, strip, 0); // filter training set (0 = ignore gaps)
  for (iSeq = 0; iSeq < iSet.len; iSeq++)
    N += iSet.entry[iSeq].body.len;
  iLV = alloc_iLetterVec(N);
  N = 0;
  for (iSeq = 0; iSeq < iSet.len; iSeq++)
    for (j = 0; j < iSet.entry[iSeq].body.len; j++)
      iLV.entry[N++] = iSet.entry[iSeq].body.entry[j];

  if (outfile != NULL) {
    fprintf(outfile, "\nIn Training file %s:", fileName);
    nucl_frequency(iSet, 1, outfile);
  }

  free_c_seqs(cSV);
  free_i_seqs(iSet);

  return iLV;
}


/*
 * chunkSeqVecTotalLen: sum of the total_len fields of all the sequences in
 * set, i.e. the total number of non-gap residues.
 */
unsigned int chunkSeqVecTotalLen(chunksSeqVec set)
{
  unsigned int sum = 0;
  int k = 0;

  while (k < set.len) {
    sum += set.entry[k].total_len;
    k++;
  }
  return sum;
}


/*
 * iLetterVec2chunksSeqVec: builds a new chunksSeqVec with the same shape
 * (headers, number, position and length of chunks) as moldSet, whose
 * letters are taken, in order, from block.
 * errorMessage is called if block.len differs from the total length of
 * moldSet.  Exits with status -1 if memory runs out.
 * The caller owns the returned set.
 */
chunksSeqVec iLetterVec2chunksSeqVec(iLetterVec block, chunksSeqVec moldSet)
{
  int iSeq, iChunk, n=0, i;
  chunksSeqVec set;
  chunksSeq *seq;
  chunkStruct *chunk;

  if ((unsigned int) block.len != chunkSeqVecTotalLen(moldSet))
    errorMessage("In iLetterVec2chunksSeqVec: the lengths of the inputs differ");
  set = alloc_chunksSeqVec(moldSet.len);
  set.strip = allocAndStrCpy(moldSet.strip);
  for (iSeq = 0; iSeq < set.len; iSeq++) {
    seq = set.entry+iSeq;
    *seq = moldSet.entry[iSeq]; // copy the scalar fields; the pointers are replaced below
    seq->header = allocAndStrCpy(moldSet.entry[iSeq].header);
    // never request 0 bytes: malloc(0) may return NULL without being out of memory
    seq->chunks = malloc((seq->num_chunks > 0 ? seq->num_chunks : 1) * sizeof(chunkStruct));
    if (seq->chunks == NULL) {
      fprintf(stderr, "iLetterVec2chunksSeqVec: out of memory\n");
      exit(-1);
    }
    for (iChunk = 0; iChunk < seq->num_chunks; iChunk++) {
      chunk = seq->chunks + iChunk;
      *chunk = moldSet.entry[iSeq].chunks[iChunk];
      chunk->body.entry = malloc((chunk->body.len > 0 ? chunk->body.len : 1) * sizeof(iLetter));
      if (chunk->body.entry == NULL) {
        fprintf(stderr, "iLetterVec2chunksSeqVec: out of memory\n");
        exit(-1);
      }
      for (i = 0; i < chunk->body.len; i++)
        chunk->body.entry[i] = block.entry[n++];
    }
  }
  return set;
}


/*
 * chunkSeq2iLetterVec: returns a newly allocated iLetterVec of length
 * cSeq.total_len that holds the letters of all the chunks of cSeq, chunk
 * after chunk.
 */
iLetterVec chunkSeq2iLetterVec(chunksSeq cSeq)
{
  iLetterVec flat = alloc_iLetterVec(cSeq.total_len);
  int out = 0;
  int c, k;

  for (c = 0; c < cSeq.num_chunks; c++) {
    chunkStruct *piece = &cSeq.chunks[c];

    for (k = 0; k < piece->body.len; k++) {
      flat.entry[out] = piece->body.entry[k];
      out++;
    }
  }

  return flat;
}


/*
 * iLetterVec2chunks: overwrites the letters of the chunks of *cSeq, chunk
 * after chunk, with the letters of block.  errorMessage is called first if
 * block.len differs from cSeq->total_len.
 */
void iLetterVec2chunks(iLetterVec block, chunksSeq *cSeq)
{
  int next = 0;
  int c, k;

  if (block.len != cSeq->total_len)
    errorMessage("In iLetterVec2chunks: the lengths of the inputs differ");

  for (c = 0; c < cSeq->num_chunks; c++) {
    chunkStruct *piece = &cSeq->chunks[c];

    for (k = 0; k < piece->body.len; k++) {
      piece->body.entry[k] = block.entry[next];
      next++;
    }
  }
}


/*
 * duplicateChunkSeq: returns a deep copy of cSeq - the header, the array of
 * chunks and the letters of every chunk are newly allocated.
 * Exits with status -1 if memory runs out.
 */
chunksSeq duplicateChunkSeq(chunksSeq cSeq)
{
  chunksSeq cSeqCopy;
  int iChunk, i;
  chunkStruct *chunk;

  cSeqCopy = cSeq; // copy the scalar fields; the pointers are replaced below
  cSeqCopy.header = allocAndStrCpy(cSeq.header);
  // never request 0 bytes: malloc(0) may return NULL without being out of memory
  cSeqCopy.chunks = malloc((cSeqCopy.num_chunks > 0 ? cSeqCopy.num_chunks : 1) * sizeof(chunkStruct));
  if (cSeqCopy.chunks == NULL) {
    fprintf(stderr, "duplicateChunkSeq: out of memory\n");
    exit(-1);
  }
  for (iChunk = 0; iChunk < cSeqCopy.num_chunks; iChunk++) {
    chunk = cSeqCopy.chunks + iChunk;
    *chunk = cSeq.chunks[iChunk];
    chunk->body.entry = malloc((chunk->body.len > 0 ? chunk->body.len : 1) * sizeof(iLetter));
    if (chunk->body.entry == NULL) {
      fprintf(stderr, "duplicateChunkSeq: out of memory\n");
      exit(-1);
    }
    for (i = 0; i < chunk->body.len; i++)
      chunk->body.entry[i] = cSeq.chunks[iChunk].body.entry[i];
  }
  return cSeqCopy;
}


/*
 * alloc_chunksSeqVec: returns a chunksSeqVec with len zero-initialised
 * entries, no strip and a fresh data set id (taken from DATA_SET_ID).
 * Exits with status -1 if memory runs out.
 */
chunksSeqVec alloc_chunksSeqVec(int len)
{
  chunksSeqVec csVec;

  // at least one entry, so that len == 0 is not mistaken for a failure
  csVec.entry = (chunksSeq *) calloc(len > 0 ? len : 1, sizeof(chunksSeq));
  if (csVec.entry == NULL) {
    fprintf(stderr, "alloc_chunksSeqVec: out of memory\n");
    exit(-1);
  }
  csVec.len = len;
  csVec.id = DATA_SET_ID++;
  csVec.strip = NULL;

  return csVec;
}


/*
 * alloc_iLetterVec: returns an iLetterVec of len zero-initialised letters.
 * The entry pointer is never NULL, not even for len == 0.
 * Exits with status -1 on a negative length or if memory runs out.
 */
iLetterVec alloc_iLetterVec(int len)
{
  iLetterVec vec;

  if (len < 0) {
    fprintf(stderr, "alloc_iLetterVec: negative length %d\n", len);
    exit(-1);
  }
  // at least one letter, so that len == 0 is not mistaken for a failure
  vec.entry = calloc(len > 0 ? len : 1, sizeof(iLetter));
  if (vec.entry == NULL) {
    fprintf(stderr, "alloc_iLetterVec: out of memory\n");
    exit(-1);
  }
  vec.len = len;
  return vec;
}


/*
 * realloc_iLetterVec: resizes the storage of *vec to len letters through
 * my_realloc and records the new length.
 */
void realloc_iLetterVec(iLetterVec *vec, int len)
{
  void *resized = (void *) my_realloc(vec->entry, len*sizeof(iLetter));

  vec->len = len;
  vec->entry = resized;
}


/*
 * free_c_seqs: releases the body and the header of every sequence of
 * c_seqs, then the array of sequences itself.
 */
void free_c_seqs(cSeqVec c_seqs)
{
  int k = 0;

  while (k < c_seqs.len) {
    cSeqStruct *rec = &c_seqs.entry[k];

    free(rec->body.entry);
    free(rec->header);
    k++;
  }
  free(c_seqs.entry);
}


/*
 * free_i_seqs: releases the header, the body and the gap lengths of every
 * sequence of i_seqs, then the array of sequences itself.
 * The strip of i_seqs is not released.
 */
void free_i_seqs(iSeqVec i_seqs)
{
	int k = 0;

	while (k < i_seqs.len) {
		iSeqStruct *rec = i_seqs.entry + k;

		/* free(NULL) is a no-op, so unset fields need no guard */
		free(rec->header);
		free(rec->body.entry);
		free(rec->gap_len);
		k++;
	}
	free(i_seqs.entry);
}


/*
 * free_chunksSeqVec: releases every sequence of seqs (via free_chunksSeq),
 * the array of sequences and the strip.
 */
void free_chunksSeqVec(chunksSeqVec seqs){
	int k = 0;

	while (k < seqs.len) {
		free_chunksSeq(seqs.entry[k]);
		k++;
	}
	free(seqs.entry);
	free(seqs.strip);	/* free(NULL) is a no-op */
}


/*
 * free_chunksSeq: releases the body of every chunk of seq (through
 * FREE_ATOMS_VEC), then the header and the array of chunks.
 */
void free_chunksSeq(chunksSeq seq)
{
	int i = 0;

	while (i < seq.num_chunks) {
	  FREE_ATOMS_VEC(seq.chunks[i].body);
	  i++;
	}
	free(seq.header);
	free(seq.chunks);
}


/*
 * permuteChunk: permutes the letters of chunk in place; the new letter at
 * position i is the old letter at position perm[i]-1 (perm holds the
 * values 1..N rather than 0..N-1).
 */
void permuteChunk(iLetterVec chunk, int *perm)
{
  const int count = chunk.len;
  iLetterVec snapshot = alloc_iLetterVec(count);
  int k;

  /* keep the original order around while chunk is overwritten */
  for (k = 0; k < count; k++)
    snapshot.entry[k] = chunk.entry[k];

  k = 0;
  while (k < count) {
    int from = perm[k] - 1;

    chunk.entry[k] = snapshot.entry[from];
    k++;
  }

  free(snapshot.entry);
}


/*
 * permuteAllSeqs: randomly permutes the letters of every sequence of set.
 * Each sequence is permuted separately and chunk boundaries are ignored:
 * the letters of all its chunks are pooled, shuffled with a permutation
 * from rand_perm_prealloc and written back chunk after chunk.
 * Exits with status -1 if memory runs out.
 */
void permuteAllSeqs(chunksSeqVec set)
{
  int iSeq, *perm=NULL, *grown, maxPerm = 0, seqLen;
  iLetterVec seqV;
  chunksSeq cSeq;

  for (iSeq = 0; iSeq < set.len; iSeq++) {
    cSeq = set.entry[iSeq]; // shallow copy: the chunk bodies are shared with set
    seqLen = cSeq.total_len;
    if (seqLen > maxPerm) {
      // the permutation buffer only ever grows, to the longest sequence so far
      grown = realloc(perm, seqLen*sizeof(int));
      if (grown == NULL) {
        fprintf(stderr, "permuteAllSeqs: out of memory\n");
        exit(-1);
      }
      perm = grown;
      maxPerm = seqLen;
    }
    rand_perm_prealloc(perm, seqLen);
    seqV = chunkSeq2iLetterVec(cSeq);
    permuteChunk(seqV, perm);
    iLetterVec2chunks(seqV, &cSeq);
    free(seqV.entry);
  }
  free(perm);
}


#define NUM_FLANK_CHARS 15  // number of characters to be printed on each side of the site

/*
 * printSite: prints one site to output using the format
 *     NUM_FLANK_CHARS site NUM_FLANK_CHARS
 * followed by the score, the 1-based sequence number and location, the
 * strand and the sequence header.
 *
 * The site starts siteLeftPos letters into chunk chunkID of sequence
 * siteSeqID of *set and is pwm_width letters long.  Gap positions are
 * printed as '-' and positions outside the sequence as ' '.  If siteRev is
 * set the whole window is printed reverse-complemented.
 * Note that the function assumes the strip is "acgt" (errorMessage is
 * called otherwise).  Exits with status -1 if memory runs out.
 */
void printSite(chunksSeqVec *set, int siteSeqID, int chunkID, int siteLeftPos, Boolean siteRev, double siteScore, int pwm_width, FILE *output)
{
  int globalStart, pos, iBuf=0, *buffer, bufferLen, i, lo, hi, left, right;
  chunksSeq *seq;
  char *printChar=" -acgt"; // indexed by code+2: -2 -> ' ', -1 -> '-', 0..3 -> a,c,g,t

  if (strcmp(printChar+2, set->strip) != 0)
    errorMessage("The set strip is not 'acgt' as I expect");

  bufferLen = 2*NUM_FLANK_CHARS + pwm_width;
  buffer = (int *)malloc((bufferLen > 0 ? bufferLen : 1)*sizeof(int));
  if (buffer == NULL) {
    fprintf(stderr, "printSite: out of memory\n");
    exit(-1);
  }

  seq = set->entry+siteSeqID;
  globalStart = seq->chunks[chunkID].startPos + siteLeftPos;

  for (pos = globalStart - NUM_FLANK_CHARS; pos < globalStart  + pwm_width + NUM_FLANK_CHARS; pos++)
    buffer[iBuf++] = retrieveSeqChar(*seq, pos);

  if (siteRev) {
    // reverse-complement in place: swap the two ends and map code c to 3-c
    // (a<->t, c<->g); the negative marker codes are left untouched
    for (lo = 0, hi = bufferLen-1; lo <= hi; lo++, hi--) {
      left = buffer[lo];
      right = buffer[hi];
      buffer[lo] = (right >= 0) ? 3-right : right;
      buffer[hi] = (left >= 0) ? 3-left : left;
    }
  }

  for (iBuf = 0; iBuf < NUM_FLANK_CHARS; iBuf++)
    fprintf(output, "%c", printChar[buffer[iBuf]+2]);
  fprintf(output, " ");
  for (iBuf = NUM_FLANK_CHARS; iBuf < NUM_FLANK_CHARS + pwm_width; iBuf++)
    fprintf(output, "%c", printChar[buffer[iBuf]+2]);
  fprintf(output, " ");
  for (iBuf = NUM_FLANK_CHARS + pwm_width; iBuf < bufferLen; iBuf++)
    fprintf(output, "%c", printChar[buffer[iBuf]+2]);

  fprintf(output, "  Score: %.3g\t Seq/ loc: %3d/%5d ", siteScore, siteSeqID+1, globalStart+1);
  if(siteRev)
    fprintf(output, "reversed strand");
  else
    fprintf(output, "forward strand");
  fprintf(output, "\t'");
  for (i = 0; i < seq->header_len; i++)
    fprintf(output, "%c", seq->header[i]);
  fprintf(output, "'\n");
  free(buffer);
}


/*
 * retrieveSeqChar: returns the numeric code of the letter at (global)
 * position pos of seq, where positions are measured like chunk startPos.
 * Special values the function returns:
 *   -1 if pos falls in a gap, i.e. outside the letters of the chunk
 *      located for it
 *   -2 if pos < 0, if pos is not smaller than the *raw* length of seq
 *      (end of the last chunk), or if seq has no chunks
 * There are more efficient ways to do this but if it's just for printing
 * better KISS.
 */
int retrieveSeqChar(chunksSeq seq, int pos)
{
  int seqRawLen, iChk, offset;

  if (seq.num_chunks <= 0)	/* nothing to index: chunks[num_chunks-1] would be invalid */
    return -2;

  seqRawLen = seq.chunks[seq.num_chunks-1].startPos + seq.chunks[seq.num_chunks-1].body.len;
  if (pos < 0 || pos >= seqRawLen)
    return -2;

  /* last chunk that starts at or before pos */
  for (iChk = 0; iChk < seq.num_chunks-1; iChk++)
    if (seq.chunks[iChk+1].startPos > pos)
      break;

  offset = pos - seq.chunks[iChk].startPos;
  /* offset < 0 can only happen in front of the first chunk */
  if (offset < 0 || offset >= seq.chunks[iChk].body.len)
    return -1;

  return seq.chunks[iChk].body.entry[offset];
}
