/**************************************************************************
 * FILE: meme-io.c
 * CREATE DATE: 3/5/2001
 * AUTHOR: William Stafford Noble
 * PROJECT: MHMM
 * COPYRIGHT: 2001-2008, WSN
 * DESCRIPTION: Read a collection of motifs from a MEME 3.0 output file.
 **************************************************************************/
#include <assert.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "io.h"
#include "meme-io.h"
#include "metameme.h"
#include "mhmm-state.h"
#include "rdb-matrix.h"  // For reading background files
#include "string-list.h"
#include "xml-util.h"
#include "alphabet.h"
#include "hash_alph.h"

/* NOTE(review): presumably the label used for a transition into the end
   state; it is not referenced in the visible part of this file. */
char* END_TRANSITION = "*";

/* Maximum allowed width of one input line. Also used as the size of the
   stack buffers that the text readers in this file read lines into. */
#define MAX_LINE 10000

/* Initialize global variables associated with SPACER_T: a table of three
   strings and the number of entries in that table. */
char * SPACER_STRS[] = {"invalid", "nrdb", "data"};
int NUM_SPACER_T = 3;

// Prefix of the line that marks the letter frequency section.
// read_freqs() looks for this text with strstr(), so only the prefix has
// to match.
char* LETTER_FREQUENCY =
"Background letter frequencies (from";
// tlb; the letter frequencies might be from a -bfile, so the full line
// is not used as the marker:
// "Background letter frequencies (from dataset with add-one prior applied):\n";

/* Strings that mark the beginning and the end of the motif occurrences  */
/* section. These have changed slightly over time, so several variants   */
/* are kept for the sake of compatibility. They are compared against     */
/* whole input lines, trailing newline included.                         */
char* START_OCCURRENCES_1 =
  "<INPUT TYPE = HIDDEN NAME = motif-summary VALUE = \"\n";
char* END_OCCURRENCES_1 = "\">\n";
char* START_OCCURRENCES_2 =
  "<INPUT TYPE = \"HIDDEN\" NAME = \"motif-summary\" VALUE = \"\n";
char* START_OCCURRENCES_3 =
  "<input type=\"hidden\" name=\"motif-summary\" value=\" \n";
char* END_OCCURRENCES_2 = " \">\n";

/* Token that marks the end of the motifs: read_motif() stops scanning
   when it reads this word. */
char* END_OF_MOTIF = "Combined";

/* Size, in bytes, of the stack buffers used to build XPath expressions. */
#define MAX_XPATH_EXPRESSION 200

/**************************************************************************
 * Make a DNA frequency array symmetric under base complementation.
 *
 * The entries for A and T are both replaced by their mean, and likewise
 * the entries for G and C. The array is modified in place.
 *
 * Asserts that the current alphabet is DNA.
 **************************************************************************/
void average_freq_with_complement(ARRAY_T *freqs) {

  assert(which_alphabet() == DNA_ALPH);

  // Locate the four bases in the current alphabet.
  char *alph = get_alphabet(FALSE);
  const int idx_a = alphabet_index('A', alph);
  const int idx_t = alphabet_index('T', alph);
  const int idx_g = alphabet_index('G', alph);
  const int idx_c = alphabet_index('C', alph);

  // Compute both means before any entry is overwritten.
  const double mean_at =
    (get_array_item(idx_a, freqs) + get_array_item(idx_t, freqs)) / 2.0;
  const double mean_gc =
    (get_array_item(idx_g, freqs) + get_array_item(idx_c, freqs)) / 2.0;

  // Store each mean in both members of its complementary pair.
  set_array_item(idx_a, mean_at, freqs);
  set_array_item(idx_t, mean_at, freqs);
  set_array_item(idx_g, mean_gc, freqs);
  set_array_item(idx_c, mean_gc, freqs);
}

/**************************************************************************
 * Read an emission distribution from a file in MEME -bfile format.
 *
 * The file is parsed as an RDB matrix whose row names are letters (or
 * longer tuples, which are ignored) and whose first column holds the
 * frequencies. The single-letter rows are reordered to match the current
 * target alphabet, and the result is extended to the ambiguous characters.
 *
 * Exits if the file cannot be opened; dies if the single-letter alphabet
 * in the file does not match the target alphabet.
 *
 * RETURN: a newly allocated array of get_alph_size(ALL_SIZE) entries.
 **************************************************************************/
ARRAY_T *read_background_file(
  char*      bg_filename      // Name of the file to read from.
)
{
  int i, j, alen1, alen2;
  char *alpha1;        // single-letter alphabet found in the file
  int  *rows;          // rows[k] is the matrix row that belongs to alpha1[k]
  char *alpha2;        // target alphabet
  RDB_MATRIX_T *rdb_matrix;
  FILE *bg_file;       /* Pointer to the background file. */
  ARRAY_T *background;      // Array of probs to return.

  /* Open the file for reading. */
  if (open_file(bg_filename, "r", FALSE, "background", "frequencies", &bg_file)
      == 0)
    exit(1);

  // Read in the background file.
  rdb_matrix = read_rdb_matrix(" \t", FALSE, 1, FALSE, NULL, bg_file);

  /* Get the alphabet from the row names; discard tuples. The original row
     index of every kept letter is remembered, because skipping a tuple
     shifts the position of all letters that follow it. */
  alen1 = get_num_strings(rdb_matrix->row_names);
  alpha1 = (char *)mm_malloc(sizeof(char) * (alen1 + 1));
  rows = (int *)mm_malloc(sizeof(int) * (alen1 + 1));
  for (i=j=0; i<alen1; i++) {
    char *wa = get_nth_string(i, rdb_matrix->row_names);
    if (strlen(wa) > 1) continue;  // ignore tuples
    // <ctype.h> functions require a value representable as unsigned char.
    unsigned char letter = (unsigned char)wa[0];
    alpha1[j] = islower(letter) ? (char)toupper(letter) : wa[0];
    rows[j] = i;
    j++;
  }
  alpha1[j] = '\0';
  alen1 = strlen(alpha1);    // length of alphabet in file

  // Get the target alphabet without ambigs.
  alpha2 = get_alphabet(FALSE);
  alen2 = get_alph_size(ALPH_SIZE);  // length of target alphabet

  // Check that alphabets are the same length.
  if (alen1 != alen2)
    die("The -bg alphabet %s should be %s.\n", alpha1, alpha2);

  /* Allocate the background array. */
  background = allocate_array(get_alph_size(ALL_SIZE));

  // Reorder the probabilities in target alphabet order.
  for (i=0; i<alen2; i++) {
    // strchr() returns NULL for a missing letter, and a pointer to the
    // terminator if asked for '\0'; both mean "not in the file".
    char *hit = strchr(alpha1, alpha2[i]);
    if (hit == NULL || (hit - alpha1) >= alen1)
      die("The -bg alphabet %s ought to be %s.\n", alpha1, alpha2);
    int ii = (int)(hit - alpha1);
    set_array_item(
      i,
      get_matrix_cell(rows[ii], 0, rdb_matrix->matrix),
      background
    );
  }

  /* Extend the distribution to account for ambiguous characters. */
  fill_in_ambiguous_chars(FALSE, background);

  /* Close the file. */
  fclose(bg_file);

  free_rdb_matrix(rdb_matrix);
  myfree(rows);
  myfree(alpha1);

  return(background);
}

/***************************************************************************
 * Build a background distribution.
 *
 *  - If bg_filename is NULL or the literal string "nrdb", the NRDB
 *    frequencies are used.
 *  - Otherwise the distribution is read from the named file.
 *
 * When verbosity is above NORMAL_VERBOSE the distribution is printed to
 * stderr.
 *
 * RETURN: the background array.
 ***************************************************************************/
ARRAY_T* get_background(char* bg_filename) {
  ARRAY_T* bg;

  if ((bg_filename != NULL) && (strcmp(bg_filename, "nrdb") != 0)) {
    // A real file name was given.
    bg = read_background_file(bg_filename);
  } else {
    // Fall back on the built-in NRDB frequencies.
    bg = allocate_array(get_alph_size(ALL_SIZE));
    get_nrdb_frequencies(bg);
    fill_in_ambiguous_chars(FALSE, bg);
  }

  if (verbosity > NORMAL_VERBOSE) {
    fprintf(stderr, "Background distribution: ");
    print_array(bg, 5, 3, TRUE, stderr);
  }

  return(bg);
}

/***********************************************************************
 * Read the version information from a MEME file.
 *
 * Lines are consumed until one starts with "MEME version <v>" or
 * "Meta-MEME version <v>". Only versions whose first character is '3'
 * or '4' are accepted; anything else closes the file and dies, as does
 * reaching the end of the file without finding a version line.
 *
 * RETURN: pointer to a static buffer holding the version string. The
 * buffer is overwritten by the next call and must not be freed.
 ***********************************************************************/
static char* read_version(FILE* motif_file)
{
  static char version[20];       /* MEME version number. */
  char        word[MAX_LINE];    /* Buffer for reading. */

  while (TRUE) {
    if (fgets(word, MAX_LINE, motif_file) == NULL) {
      fclose(motif_file);
      die("Can't find MEME version.\n");
    }
    /* The field width (19) keeps sscanf within the 20-byte version
       buffer, terminator included. */
    if ((sscanf(word, "MEME version %19s", version) == 1) ||
        (sscanf(word, "Meta-MEME version %19s", version) == 1)) {
      if (version[0] == '3' || version[0] == '4') {
        break;
      }
      else if ((version[0] == '1') || (version[0] == '2')) {
        fclose(motif_file);
        die("Sorry, MEME version %c output files are no longer supported.\n",
          version[0]);
      }
      else {
        fclose(motif_file);
        die("Unknown MEME version: %s\n", version);
      }
    }
  }
  return(version);
}

/***********************************************************************
 * Read the version information from MEME XML.
 *
 * Returns the "version" attribute of the /MEME root element and echoes
 * it to stderr. No check is made on the value of the version.
 * Dies if the root element or the attribute is missing.
 *
 * Caller is responsible for freeing the returned xmlChar.
 ***********************************************************************/
static xmlChar* read_version_from_xml(xmlDocPtr meme_doc) {

  xmlXPathContextPtr xpathCtx = NULL;
  xmlXPathObjectPtr xpathObj = NULL;
  xmlChar* version = NULL;

  xpathCtx = xmlXPathNewContext(meme_doc);
  if(xpathCtx == NULL) {
    die("Error: unable to create new XPath context while reading version.\n");
  }
  xpathObj = xmlXPathEvalExpression(BAD_CAST "/MEME", xpathCtx);
  if(xpathObj == NULL) {
      die("Error: unable to evaluate xpath expression /MEME.\n");
  }
  // A query that matches nothing yields a NULL or empty node set;
  // indexing nodeTab[0] is only valid once that has been ruled out.
  if (xpathObj->nodesetval == NULL || xpathObj->nodesetval->nodeNr < 1) {
    die("Error: missing MEME root element while reading version.\n");
  }
  // xmlGetProp returns a copy, so the XPath objects can be released
  // before the value is used.
  version = xmlGetProp(xpathObj->nodesetval->nodeTab[0], BAD_CAST "version");
  xmlXPathFreeObject(xpathObj);
  xmlXPathFreeContext(xpathCtx);
  if (version == NULL) {
    die("Error: missing version attribute in MEME.\n");
  }
  fprintf(stderr, "MEME version is %s\n", version);

  return(version);
}

/***********************************************************************
 * Read the alphabet from a MEME file and install it as the current
 * alphabet.
 *
 * Lines are consumed until one of the form "ALPHABET= <letters>" is
 * found; the letters are passed to set_alphabet(). Dies if the end of
 * the file is reached first.
 ***********************************************************************/
static void read_alphabet
  (FILE*     motif_file)
{
  int  alph_size;          /* Size of the alphabet. */
  char buffer[MAX_LINE];   /* Buffer for reading. */
  char alphabet[MAX_LINE]; /* The letters found on the ALPHABET line. */

  alph_size = 0;
  while (alph_size == 0) {
    if (fgets(buffer, MAX_LINE, motif_file) == NULL)
      die("Can't read the alphabet from the motif file.\n");
    /* Scan into a separate buffer: sscanf has undefined behavior when
       its input and output objects overlap. A token taken from buffer
       always fits in alphabet, since both have the same size. */
    if (sscanf(buffer, "ALPHABET= %s", alphabet) == 1) {
      alph_size = (int)strlen(alphabet);
      set_alphabet(verbosity, alphabet);
    }
  }
}

/***********************************************************************
 * Scan a MEME file for the "strands:" line and report whether both
 * strands are included.
 *
 * RETURN: TRUE if the line lists two strands ("+ -"), FALSE if it lists
 * only one ("+"). Dies if the file ends before such a line is seen.
 ***********************************************************************/
static BOOLEAN_T read_strand
  (FILE*     motif_file)
{
  char line[MAX_LINE];  // Current input line.
  char plus[MAX_LINE];  // First strand token on the line.
  char minus[MAX_LINE]; // Second strand token on the line, if any.

  // Keep reading until a line yields at least one strand token.
  for (;;) {
    if (fgets(line, MAX_LINE, motif_file) == NULL) {
      die("Can't find strand information in the motif file.\n");
    }

    const int n_tokens = sscanf(line, "strands: %s %s", plus, minus);

    if (n_tokens == 2) {
      assert((strcmp(plus, "+") == 0) && (strcmp(minus, "-") == 0)) ;
      return(TRUE);  // Has forward and reverse strand.
    }
    if (n_tokens == 1) {
      assert(strcmp(plus, "+") == 0);
      return(FALSE); // Only has forward strand.
    }
  }
}

/***********************************************************************
 * Read MEME XML to find out whether both strands are included.
 *
 * Evaluates /MEME/model/strands and compares its text with "both".
 * Dies, with the same message as the plain-text reader, if the element
 * is not present.
 *
 * RETURN: TRUE if the element's text is exactly "both", FALSE otherwise.
 ***********************************************************************/
static BOOLEAN_T read_strand_from_xml(xmlXPathContextPtr xpath_ctxt) {

  xmlXPathObjectPtr xpath_obj = NULL;
  xmlChar* strand = NULL;
  BOOLEAN_T result = FALSE;

  xpath_obj = xpath_query(xpath_ctxt, "/MEME/model/strands");
  // A query that matches nothing yields a NULL or empty node set;
  // indexing nodeTab[0] is only valid once that has been ruled out.
  if (xpath_obj == NULL
      || xpath_obj->nodesetval == NULL
      || xpath_obj->nodesetval->nodeNr < 1) {
    die("Can't find strand information in the motif file.\n");
  }
  strand = xmlXPathCastNodeToString(xpath_obj->nodesetval->nodeTab[0]);
  // Comparing 5 bytes includes the terminator, so this is an exact match.
  if (strand != NULL && strncmp("both", (const char *) strand, 5) == 0) {
    result = TRUE;
  }
  if (strand != NULL) {
    xmlFree(strand);
  }
  xmlXPathFreeObject(xpath_obj);
  return result;
}


/***********************************************************************
 * Read a set of letter frequencies from MEME output.
 *
 * Skips forward to the line containing the LETTER_FREQUENCY marker and
 * then reads one "<letter> <frequency>" pair per letter of the current
 * alphabet. The letters are asserted to appear in alphabet order.
 * The values are normalized and then extended to the ambiguous
 * characters.
 *
 * Dies if the marker line is missing or a pair cannot be read.
 *
 * RETURN: a newly allocated array of get_alph_size(ALL_SIZE) entries.
 ***********************************************************************/
static ARRAY_T* read_freqs
  (FILE *   infile)      // An open file in MEME output format.
{
  char     buffer[MAX_LINE]; /* Buffer for reading from the MEME file. */
  char     letter[2] = "?";  /* Buffer for reading alphabet. Must be >= 2
                                bytes! Preset so that error messages never
                                print an uninitialized character. */
  int      alph_size;
  int      i;
  ATYPE    value;
  ARRAY_T* return_value;

  // Allocate the array.
  return_value = allocate_array(get_alph_size(ALL_SIZE));

  /* Find the right line. */
  while (TRUE) {
    if (fgets(buffer, MAX_LINE, infile) == NULL) {
      die("Can't find letter frequencies.");
    }
    if (strstr(buffer, LETTER_FREQUENCY) != NULL) {
      break;
    }
  }

  /* Read letter-frequency pairs. */
  alph_size = get_alph_size(ALPH_SIZE);
  for (i = 0; i < alph_size; i++) {
    if (fscanf(infile, "%1s", letter) != 1) {
      /* %c takes a character, not the array itself. */
      die("Error reading frequencies: %c %d\n", letter[0], i);
    }
    if (fscanf(infile, PROB_SCAN, &value) != 1) {
      /* Nothing has been stored at position i yet, so there is no
         meaningful value to report here. */
      die("Error reading frequency value: %c %d\n", letter[0], i);
    }
    set_array_item(i, value, return_value);

    /* Make sure the alphabet corresponds to the dataset's alphabet. */
    assert(letter[0] == get_alph_char(i));
  }

  /* Make sure the frequencies add up to 1.0. */
  normalize_subarray(0, alph_size, 0.0, return_value);

  /* Fill in ambiguous characters. */
  fill_in_ambiguous_chars(FALSE, return_value);
  return(return_value);
}

/***********************************************************************
 * Read a motif frequency matrix from MEME text output.
 *
 * Steps:
 *  1. Find the "BL   MOTIF <id> width=<w> seqs=<n>" line and check that
 *     <id> matches the ID already stored in the motif.
 *  2. Find the "letter-probability matrix:" header and read its
 *     parameters (both the old "n=" and the current "nsites=" forms).
 *  3. Read width x alength probabilities, mixing each one with the
 *     background:
 *       (nsites * p + pseudocount * bg) / (nsites + pseudocount)
 *  4. Normalize each row and fill in the ambiguous characters.
 *
 * On return the motif's length, num_sites, evalue and freqs are set.
 * Dies on any parse error or implausible parameter.
 ***********************************************************************/
static void read_letter_freq(
  FILE *    infile,   /* An open file in MEME output format with the
                         pointer before the requested motif. IN */
  ARRAY_T* background, /* Pointer to array of bg freqs. */
  double pseudocount, /* pseudocount IN */
  MOTIF_T * a_motif   /* The motif. OUT */
) {
  char     buffer[MAX_LINE];    /* Buffer for reading. */
  char     motif_id[MAX_LINE];  /* ID found in the block header. Sized like
                                   buffer so that a "%s" scan of buffer can
                                   never overflow it. */
  ARRAY_T* these_freqs;
  int      alph_size;           /* Size of the alphabet, as read from file. */
  int      i;
  int      j;
  float    evalue;
  int      unused_value;

  // Find the motif block.
  for (buffer[0] = '\0'; strstr(buffer, "BL   MOTIF ") == NULL; ) {
    if (fgets(buffer, MAX_LINE, infile) == NULL) {
      die("Can't find motif block in motif %s.\n", get_motif_id(a_motif));
    }
  }

  // Read the number of sites.
  if (sscanf(buffer, "BL   MOTIF %s width=%d seqs=%lf",
       motif_id, &(a_motif->length), &(a_motif->num_sites)) != 3) {
    die("Error reading number of sites in motif %s.\n[%s]\n",
      a_motif->id, buffer);
  }

  // Make sure we aren't looking at the wrong motif due to an error
  // in the input file format.
  if (strcmp(motif_id, a_motif->id)) {
    die("Error in motif block. Motif id %s in letter freq. header"
        " doesn't match expected motif id %s.\n", motif_id, a_motif->id);
  }

  /* Read through the motif until we reach the letter-frequency matrix. */
  for (buffer[0] = '\0'; strstr(buffer, "letter-probability matrix: ")
    == NULL; ) {
    if (fgets(buffer, MAX_LINE, infile) == NULL) {
      die("Can't find frequency matrix in motif %s.\n", a_motif->id);
    }
  }

  /* Read the motif parameters, dealing with old MEME format. */
  if (sscanf(buffer,
       "letter-probability matrix: alength= %d w= %d n= %d E= %f",
       &alph_size, &(a_motif->length), &unused_value, &evalue)
      == 4) {

    // Issue a warning, but only once per run.
    static BOOLEAN_T first_time = TRUE;
    if (first_time) {
      fprintf(stderr, "\nWarning: This is an old MEME file that contains\n");
      fprintf(stderr, "posterior probabilities rather than frequencies.\n");
      fprintf(stderr, "Meta-MEME will still work, but it would be better\n");
      fprintf(stderr, "to run an updated version of MEME on your data.\n");
      first_time = FALSE;
    }

  }
  else if (sscanf(buffer,
    "letter-probability matrix: alength= %d w= %d nsites= %lf E= %f",
    &alph_size, &(a_motif->length), &(a_motif->num_sites), &evalue)  != 4) {
      die("Error reading frequency matrix parameters in motif %s.\n[%s]\n",
        a_motif->id, buffer);
  }
  // Sanity checks for motif parameters
  if (a_motif->num_sites <= 0.0) {
    die(
      "\nThe record for motif %s indicated the number of sites was %lf.\n"
      "The number of sites for a motif must be a positive number.\n",
      a_motif->id,
      a_motif->num_sites
    );
  }
  if (a_motif->length <= 0) {
    die(
      "\nThe record for motif %s indicated the width of the motif was %d.\n"
      "The width of a motif must be a positive number.\n",
      a_motif->id,
      a_motif->length
    );
  }
  // Each matrix row and the background array hold get_alph_size(ALL_SIZE)
  // entries; an alength outside that range would index past their end.
  if (alph_size < 1 || alph_size > get_alph_size(ALL_SIZE)) {
    die("Error reading frequency matrix parameters in motif %s.\n[%s]\n",
      a_motif->id, buffer);
  }
  a_motif->evalue = evalue;

  /* Allocate memory for the matrix. */
  a_motif->freqs = allocate_matrix(a_motif->length, get_alph_size(ALL_SIZE));

  /* Read the letter frequency matrix. */
  for (i = 0; i < a_motif->length; i++) {
    these_freqs = get_matrix_row(i, a_motif->freqs);

    for (j = 0; j < alph_size; j++) {
      double letter_freq = 0.0;
      double bg_letter_freq = get_array_item(j, background);
      if (fscanf(infile, PROB_SCAN, &letter_freq) != 1) {
        die("Error reading frequency at [%d][%d].\n", i, j);
      }
      // Adjust freq using pseudocount
      letter_freq =
        ((a_motif->num_sites * letter_freq) + pseudocount * bg_letter_freq)
        / (a_motif->num_sites  + pseudocount);
      set_array_item(j, letter_freq, these_freqs);
    }

    /* Normalize the first alph_size positions. (MEME prints six
       digits of precision.) */
    normalize_subarray(0, alph_size, 0.00001, these_freqs);

    /* Compute values for ambiguous characters. */
    fill_in_ambiguous_chars(FALSE, these_freqs);
  }
}

/***********************************************************************
 * Read a motif frequency matrix from MEME XML into a motif.
 *
 * For every motif position an XPath query selects the <value> elements
 * of the matching alphabet_array. Each value is mixed with the
 * background:
 *   (num_sites * p + pseudocount * bg) / (num_sites + pseudocount)
 * The row is then normalized and extended to the ambiguous characters.
 *
 * The motif's length, alph_size and num_sites must already be set.
 * Dies if the XPath expression does not fit its buffer or if a row does
 * not contain exactly alph_size values.
 ***********************************************************************/
static void read_letter_freq_from_xml(
  xmlXPathContextPtr xpath_ctxt,  // XML document stream IN
  ARRAY_T* background, // Pointer to array of bg freqs. IN
  double pseudocount,  // Pseudocount to be applied to motif freq. IN
  char* motif_id,      // XML id attribute for motif IN
  MOTIF_T* motif       // Motif structure OUT
) {

  int      motif_col;
  int      alpha_col;
  ARRAY_T* these_freqs;
  ATYPE    value;
  xmlXPathObjectPtr xpath_obj;
  char xpath_expression[MAX_XPATH_EXPRESSION];

  motif->freqs = allocate_matrix(motif->length, get_alph_size(ALL_SIZE));

  // Read the letter frequency matrix,
  // one array per position in the motif
  for (motif_col = 0; motif_col < motif->length; motif_col++) {

    these_freqs = get_matrix_row(motif_col, motif->freqs);

    // Build the XPATH expression to get one row of the freq matrix.
    int char_written = snprintf(
      xpath_expression,
      MAX_XPATH_EXPRESSION,
      "/MEME/motifs/motif[@id='%s']/probabilities/alphabet_matrix/"
      "alphabet_array[position()=%d]/value",
      motif_id,
      motif_col + 1
    );
    // A truncated expression would silently query the wrong thing.
    if (char_written < 0 || char_written >= MAX_XPATH_EXPRESSION) {
      die("Unable to read frequencies for motif %s. "
          "XPath expression exceeded maximum allowed size.\n", motif_id);
    }
    xpath_obj = xpath_query(xpath_ctxt, xpath_expression);

    // The number of columns in the freq matrix should
    // match the size of the alphabet. This is checked explicitly rather
    // than asserted, because a query that matches nothing yields a NULL
    // node set and the loop below indexes nodeTab.
    if (xpath_obj == NULL
        || xpath_obj->nodesetval == NULL
        || xpath_obj->nodesetval->nodeNr != motif->alph_size) {
      die("Error reading frequency matrix for motif %s: position %d "
          "does not contain %d values.\n",
          motif_id, motif_col + 1, motif->alph_size);
    }

    // Fill in the values of the row from the XPATH result.
    for (alpha_col = 0; alpha_col < motif->alph_size; alpha_col++) {
      double bg_letter_freq = get_array_item(alpha_col, background);
      xmlNodePtr currValueNode = xpath_obj->nodesetval->nodeTab[alpha_col];
      value = xmlXPathCastNodeToNumber(currValueNode);
      // Adjust freq using pseudocount
      value = ((motif->num_sites * value) + pseudocount * bg_letter_freq)
        / (motif->num_sites  + pseudocount);
      set_array_item(alpha_col, value, these_freqs);
    }

    // Normalize the first alph_size positions. (MEME prints six
    // digits of precision).
    normalize_subarray(0, motif->alph_size, 0.00001, these_freqs);

    /* Compute values for ambiguous characters. */
    fill_in_ambiguous_chars(FALSE, these_freqs);

    xmlXPathFreeObject(xpath_obj);

  }

}

/***********************************************************************
 * Read the next motif from a file into a motif data structure.
 *
 * Whitespace-separated words are scanned until one of these is seen:
 *  - "MOTIF": the rest of the line holds the ID and an optional
 *    secondary ID (used by Tomtom and possibly future programs);
 *  - the HTML variant (HREF="#summary_doc">MOTIF): the next word holds
 *    the ID followed by markup, which is cut off at the first '<';
 *  - END_OF_MOTIF or end of file: there are no more motifs.
 *
 * The IDs are stored in the motif, then the frequency matrix is read and
 * the alphabet sizes and the complexity are filled in.
 *
 * RETURN: Boolean - Was a motif found?
 ***********************************************************************/
static BOOLEAN_T read_motif(
  FILE*    infile,      /* An open file in MEME output format. IN */
  ARRAY_T* background,  /* Pointer to array of bg freqs. IN */
  double   pseudocount, /* Pseudocount IN */
  MOTIF_T* a_motif      /* The motif (pre-allocated). OUT */
) {
  char word[MAX_LINE];          // Buffer for reading.
  char word_format[32];         // Width-limited "%s" format for word.
  int  num_scanned;             // How many elements were read by fscanf?
  char *  motif_id_end;          // Use to mark end of MOTIF marker
  char motif_id[MAX_MOTIF_ID_LENGTH + 5];
  char motif_id2[MAX_MOTIF_ID_LENGTH + 5];
                                // ID number of the current motif. Allow enough
                                // space for trailing </A> in input string

  // A bare "%s" would write past the end of word on an over-long token,
  // so the format carries an explicit field width of MAX_LINE - 1.
  snprintf(word_format, sizeof(word_format), "%%%ds", MAX_LINE - 1);

  /* Read until we reach the next motif. */
  num_scanned = 1;
  while (num_scanned == 1) {
    num_scanned = fscanf(infile, word_format, word);

    /* If the end of the file is reached, return FALSE. */
    if ((num_scanned != 1) || (strcmp(word, END_OF_MOTIF) == 0)) {
      return(FALSE);
    }

    /* If we found the next motif index, break. */
    if (strcmp(word, "MOTIF") == 0) {
      // allow a second word in the ID line to be read as "id2"
      // for use by Tomtom and possibly future programs
      char *line_ptr=NULL;
      int i, j;

      // Read the rest of the line
      line_ptr = getline2(infile);
      if (! line_ptr) {
        die("Error reading motif ID.\n");
      }
      int n_read = strlen(line_ptr);

      // <ctype.h> functions require a value representable as unsigned
      // char, hence the casts below.

      // copy the ID
      for (i=0; i<n_read-1 && isspace((unsigned char)line_ptr[i]); i++)
        ; // skip whitespace
      // read ID
      for (j=0;
        i<n_read-1 && !isspace((unsigned char)line_ptr[i])
          && j<MAX_MOTIF_ID_LENGTH;
        i++, j++) {
        motif_id[j] = line_ptr[i];
      }
      motif_id[j] = '\0';
      if (j == 0) {
        die("Error reading motif ID.\n");
      }

      // copy the secondary ID
      for ( ; i<n_read-1 && isspace((unsigned char)line_ptr[i]); i++)
        ; // skip whitespace
      for (j=0;
        i<n_read-1 && !isspace((unsigned char)line_ptr[i])
          && j<MAX_MOTIF_ID_LENGTH;
        i++, j++) {
        motif_id2[j] = line_ptr[i];
      }
      motif_id2[j] = '\0';

      myfree(line_ptr);

      break;			// done
    }

    /* Check for the other variant of the motif marker */
    if (strcmp(word, "HREF=\"#summary_doc\">MOTIF") == 0
      || strcmp(word, "href=\"#summary_doc\">MOTIF") == 0) {
      // Read the token into the large word buffer first; motif_id is far
      // smaller and the token still carries trailing markup.
      if (fscanf(infile, word_format, word) != 1) {
        die("Error reading motif number.\n");
      }
      motif_id_end = strchr(word, '<');
      if (motif_id_end == NULL) {
        die("Error reading motif ID.\n");
      }
      *motif_id_end = 0;
      // Bounded copy: an over-long ID is truncated instead of overflowing.
      snprintf(motif_id, sizeof(motif_id), "%s", word);
      motif_id2[0] = '\0';	// No secondary ID in this format
      break;
    }
  }

  /* Record the ID of this motif. */
  set_motif_id(motif_id, a_motif);
  set_motif_id2(motif_id2, a_motif);
  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, " %s", get_motif_id(a_motif));
  }

  /* Read the frequency matrix. */
  read_letter_freq(infile, background, pseudocount, a_motif);

  /* Store the alphabet size in the motif. */
  a_motif->alph_size = get_alph_size(ALPH_SIZE);
  a_motif->ambigs = get_alph_size(AMBIG_SIZE);

  // Compute and store the motif complexity.
  a_motif->complexity = compute_motif_complexity(a_motif);

  return(TRUE);
}

/***********************************************************************
 * Read just the motifs from a MEME file.
 *
 * Motifs are read one after another into consecutive slots of the
 * caller-supplied array until read_motif() reports that there are no
 * more. The array must be large enough; no capacity is passed in.
 * Progress is reported on stderr at normal verbosity and above.
 ***********************************************************************/
static void read_motifs(
  FILE*    motif_file,       // MEME file. IN
  ARRAY_T* background,       // Pointer to array of bg freqs. IN
  double   pseudocount,      // Pseudocount IN
  int*     num_motifs,       // Number of motifs retrieved.  OUT
  MOTIF_T* motifs            // The retrieved motifs.  OUT
) {
  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Reading motif");
  }

  // Fill the next free slot until the reader runs out of motifs.
  for (*num_motifs = 0; ; (*num_motifs)++) {
    MOTIF_T* next_slot = &(motifs[*num_motifs]);
    if (!read_motif(motif_file, background, pseudocount, next_slot)) {
      break;
    }
  }

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, ".\n");
  }
}

/***********************************************************************
 * Read the motifs from an XML document.
 *
 * Every /MEME/motifs/motif element becomes one entry of the motifs
 * array: its name, width, e_value and sites attributes are copied into
 * the motif, and its id attribute is used to look up the frequency
 * matrix. The secondary ID is set to the empty string.
 *
 * The caller is responsible for allocating the array of motifs.
 * The motif_occurrences argument is not used by this function.
 ***********************************************************************/
static void read_motifs_from_xml(
  xmlXPathContextPtr xpath_ctxt,    // MEME XPath context.
  ARRAY_T*  background,             // Pointer to array of bg freqs. IN
  double    pseudocount,            // pseudocount to be applied to motif IN
  int*      num_motifs,             // Number of motifs retrieved. OUT
  MOTIF_T*  motifs,                 // The retrieved motifs.  OUT
  STRING_LIST_T** motif_occurrences // Strings desc. motif occurrences  OUT
) {

  // This query selects all motifs.
  xmlXPathObjectPtr all_motifs = xpath_query(xpath_ctxt, "/MEME/motifs/motif");

  if (all_motifs->nodesetval) {
    *num_motifs = all_motifs->nodesetval->nodeNr;
  } else {
    *num_motifs = 0;
  }

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Reading motif ");
  }

  int idx;
  for (idx = 0; idx < *num_motifs; idx++) {

    MOTIF_T* motif = &motifs[idx];
    xmlChar* attr = NULL;

    xmlNodePtr node = all_motifs->nodesetval->nodeTab[idx];
    if (node == NULL) {
      die("Error: missing motif %d\n", idx);
    }

    // name -> primary ID; there is no secondary ID in the XML.
    attr = read_xml_node_property(node, "name");
    set_motif_id((char *) attr, motif);
    set_motif_id2("", motif);
    xmlFree(attr);

    // width -> motif length
    attr = read_xml_node_property(node, "width");
    motif->length = atoi((char *) attr);
    xmlFree(attr);

    // e_value -> motif E-value
    attr = read_xml_node_property(node, "e_value");
    motif->evalue = atof((char *) attr);
    xmlFree(attr);

    // sites -> number of sites
    attr = read_xml_node_property(node, "sites");
    motif->num_sites = atof((char *) attr);
    xmlFree(attr);

    // The alphabet sizes come from the current alphabet, not the XML.
    motif->alph_size = get_alph_size(ALPH_SIZE);
    motif->ambigs = get_alph_size(AMBIG_SIZE);

    // The XML id attribute selects this motif's frequency matrix.
    attr = read_xml_node_property(node, "id");
    read_letter_freq_from_xml(
      xpath_ctxt,
      background,
      pseudocount,
      (char *) attr,
      motif
    );
    xmlFree(attr);

    // Compute and store the motif complexity.
    motif->complexity = compute_motif_complexity(motif);

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, " %s", get_motif_id(motif));
    }
  }

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, ".\n");
  }

  /* Cleanup */
  xmlXPathFreeObject(all_motifs);
}


/***********************************************************************
 * Create two copies of each motif.  The new IDs are preceded by "+"
 * and "-", and the "-" version is the reverse complement of the "+"
 * version.

void add_reverse_complements
  (int* num_motifs,
   MOTIF_T* motifs)
{
  int i_motif;                  // Index of the current motif.
  char motif_id[MAX_MOTIF_ID_LENGTH + 1]; // Name of the current motif;

  // Copy motifs.
  for (i_motif = 0; i_motif < *num_motifs; i_motif++) {
    copy_motif(&(motifs[i_motif]), &(motifs[*num_motifs + i_motif]));
    assert(motifs[i_motif].length != 0);
    assert(motifs[*num_motifs + i_motif].length != 0);
  }

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Reverse complementing:");
  }

  // Add plusses to the first n motifs.
  motif_id[0] = '+';
  for (i_motif = 0; i_motif < *num_motifs; i_motif++) {
    strcpy(&(motif_id[1]), get_motif_id(&(motifs[i_motif])));
    set_motif_id(motif_id, &(motifs[i_motif]));

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, " %s", motif_id);
    }
  }

  // Add minuses to the second n motifs.
  motif_id[0] = '-';
  for (i_motif = 0; i_motif < *num_motifs; i_motif++) {
    strcpy(&(motif_id[1]), get_motif_id(&(motifs[i_motif + *num_motifs])));
    set_motif_id(motif_id, &(motifs[i_motif + *num_motifs]));

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, " %s", motif_id);
    }
  }

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "\n");
  }

  // Make the reverse complements.
  for (i_motif = 0; i_motif < *num_motifs; i_motif++) {
    reverse_complement_motif(&(motifs[i_motif + *num_motifs]));
  }

  // Double the motif counter.
  *num_motifs *= 2;
}
 ***********************************************************************/

/***********************************************************************
 * Create two copies of each motif.  The new IDs are preceded by "+"
 * and "-", and the "-" version is the reverse complement of the "+"
 * version.
 *
 * John Hawkins 2008 - I am changing this function so that the inverse
 * motifs are placed directly after the original in the array. This
 * helps with the implementation of the BLS scan mode and has no apparent
 * effect on the other scan modes
 *
 * On entry the array holds *num_motifs motifs in slots 0..n-1. On return
 * motif k occupies slot 2k ("+") and its reverse complement slot 2k+1
 * ("-"), and *num_motifs has been doubled. Slots up to 2n-1 are written,
 * so the array must have room for them.
 ***********************************************************************/
void add_reverse_complements
  (int* num_motifs,
   MOTIF_T* motifs)
{
  int i_motif;                  // Index of the current motif.
  // Room for the strand prefix, a full-length ID and the terminator.
  // (MAX_MOTIF_ID_LENGTH + 1 would be one byte short for the longest ID.)
  char motif_id[MAX_MOTIF_ID_LENGTH + 2];

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Reverse complementing:");
  }

  // JH - Changed here so that copies end up directly after each original.
  // Work from the last motif down, so that a motif is always moved before
  // its old slot is reused as the destination of an earlier one.
  for (i_motif = (*num_motifs)-1; i_motif > -1; i_motif--) {
    MOTIF_T* original = &(motifs[i_motif]);
    MOTIF_T* forward  = &(motifs[2 * i_motif]);
    MOTIF_T* reverse  = &(motifs[2 * i_motif + 1]);

    // First copy the original to its new place. Motif 0 is already there.
    if (i_motif > 0) {
      copy_motif(original, forward);
    }
    // Now make the second copy.
    copy_motif(original, reverse);
    // Free freq. matrix from original, unless the original stays in use
    // as the forward motif.
    if (i_motif > 0) {
      free_matrix(original->freqs);
    }

    // Now add plusses to the original motifs.
    snprintf(motif_id, sizeof(motif_id), "+%s", get_motif_id(forward));
    set_motif_id(motif_id, forward);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, " %s", motif_id);
    }

    // and add minuses to the copy motifs.
    snprintf(motif_id, sizeof(motif_id), "-%s", get_motif_id(reverse));
    set_motif_id(motif_id, reverse);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, " %s", motif_id);
    }

    // Make the copy a reverse complement.
    reverse_complement_motif(reverse);
  }

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "\n");
  }

  // Double the motif counter.
  *num_motifs *= 2;
}
 /***********************************************************************/

/***********************************************************************
 * Read the motif occurrences info from a MEME file.
 *
 * Each line contains the following items
 *  - sequence id,
 *  - sequence p-value,
 *  - number n of motif occurrences, and
 *  - length of sequence.
 *
 * This is followed by n triples containing
 *  - motif id,
 *  - occurrence position, and
 *  - occurrence p-value.
 *
 * The lines are stored verbatim, one string per line; they are not
 * parsed here.
 *
 * Return value: Does the given file contain a motif occurrence section?
 * If not, *motif_occurences is set to NULL.
 ***********************************************************************/
static BOOLEAN_T read_motif_occurrences(
  FILE*      motif_file,            /* MEME file. IN */
  STRING_LIST_T** motif_occurences  /* List of motif occurence strings. OUT */
)
{
  char      line[MAX_LINE]; // Buffer for reading.
  int       in_section = 0; // Has the start marker been seen?

  // Look for the beginning of the motif occurrence section. The marker
  // has changed slightly over time, so every known variant is accepted.
  while (!in_section) {
    if (fgets(line, MAX_LINE, motif_file) == NULL) {
      *motif_occurences = NULL;
      return(FALSE);
    }
    in_section = (strcmp(line, START_OCCURRENCES_1) == 0)
      || (strcmp(line, START_OCCURRENCES_2) == 0)
      || (strcmp(line, START_OCCURRENCES_3) == 0);
  }

  *motif_occurences = new_string_list();

  // Collect one line per sequence until an end marker (which also comes
  // in more than one variant) or the end of the file.
  while (fgets(line, MAX_LINE, motif_file) != NULL) {
    if ((strcmp(line, END_OCCURRENCES_1) == 0)
        || (strcmp(line, END_OCCURRENCES_2) == 0)) {
      break;
    }
    add_string(line, *motif_occurences);
  }

  return(TRUE);
}

/***********************************************************************
 * Look up the sequence name corresponding to a sequence id in
 * the XML file.
 *
 * Returns the squence name. Caller is responsible for freeing the name.
 ***********************************************************************/
static xmlChar* get_sequence_name_from_id(xmlXPathContextPtr ctxt, xmlChar* id) {

  xmlXPathObjectPtr sequence_obj;
  char xpath_expression[MAX_XPATH_EXPRESSION];
  xmlNodePtr node = NULL;

  // A missing id cannot be formatted into the query below.
  if (id == NULL) {
    die("Unable to get sequence name from sequence id. No sequence id was given.");
  }

  // Build the XPath query selecting the sequence element with this id.
  int char_written = snprintf(
    xpath_expression,
    MAX_XPATH_EXPRESSION,
    "/MEME/training_set/sequence[@id='%s']",
    (char *) id
  );

  // Check that formatting succeeded and that we didn't have to truncate the string.
  if (char_written < 0 || char_written >= MAX_XPATH_EXPRESSION) {
    die("Unable to get sequence name from sequence id. XPath expression exceeded maxiumum allowed size.");
  }
  sequence_obj = xpath_query(ctxt, xpath_expression);
  if (sequence_obj == NULL) {
    die("Unable to query XML file for name of sequence with id %s.", id);
  }

  // A query that matches nothing leaves the node set NULL or empty;
  // nodeTab[0] must not be read in that case.
  if (sequence_obj->nodesetval == NULL
      || sequence_obj->nodesetval->nodeNr < 1
      || sequence_obj->nodesetval->nodeTab == NULL) {
    die("Unable to find name for sequence with id %s.", id);
  }
  node = sequence_obj->nodesetval->nodeTab[0];
  if (node == NULL) {
    die("Unable to find name for sequence with id %s.", id);
  }

  // xmlGetProp returns a copy, so the query result can be released
  // before the name is handed to the caller.
  xmlChar* sequence_name = xmlGetProp(node, BAD_CAST "name");
  xmlXPathFreeObject(sequence_obj);

  if (sequence_name == NULL) {
    die("Unable to find name for sequence with id %s.", id);
  }

  return sequence_name;
}

/***********************************************************************
 * Look up the motif name corresponding to a motif id in
 * the XML file.
 *
 * Returns the motif name. Caller is responsible for freeing the name.
 ***********************************************************************/
static xmlChar* get_motif_name_from_id(xmlXPathContextPtr ctxt, xmlChar* id) {

  xmlXPathObjectPtr motif_obj;
  char xpath_expression[MAX_XPATH_EXPRESSION];
  xmlNodePtr node = NULL;

  // A missing id cannot be formatted into the query below.
  if (id == NULL) {
    die("Unable to get motif name from motif id. No motif id was given.");
  }

  // Build the XPath query selecting the motif element with this id.
  int char_written = snprintf(
    xpath_expression,
    MAX_XPATH_EXPRESSION,
    "/MEME/motifs/motif[@id='%s']",
    (char *) id
  );

  // Check that formatting succeeded and that we didn't have to truncate the string.
  if (char_written < 0 || char_written >= MAX_XPATH_EXPRESSION) {
    die("Unable to get motif name from motif id. XPath expression exceeded maxiumum allowed size.");
  }
  motif_obj = xpath_query(ctxt, xpath_expression);
  if (motif_obj == NULL) {
    die("Unable to query XML file for name of motif with id %s.", id);
  }

  // A query that matches nothing leaves the node set NULL or empty;
  // nodeTab[0] must not be read in that case.
  if (motif_obj->nodesetval == NULL
      || motif_obj->nodesetval->nodeNr < 1
      || motif_obj->nodesetval->nodeTab == NULL) {
    die("Unable to find name for motif with id %s.", id);
  }
  node = motif_obj->nodesetval->nodeTab[0];
  if (node == NULL) {
    die("Unable to find name for motif with id %s.", id);
  }

  // xmlGetProp returns a copy, so the query result can be released
  // before the name is handed to the caller.
  xmlChar* motif_name = xmlGetProp(node, BAD_CAST "name");
  xmlXPathFreeObject(motif_obj);

  if (motif_name == NULL) {
    die("Unable to find name for motif with id %s.", id);
  }

  return motif_name;
}
/***********************************************************************
 * Look up the sequence length corresponding to a sequence id
 * in the XML file.
 *
 * Returns the sequence length as a string. Caller is responsible
 * for freeing the string.
 ***********************************************************************/
static xmlChar* get_sequence_length_from_id(xmlXPathContextPtr ctxt, xmlChar* id) {

  xmlXPathObjectPtr sequence_obj;
  char xpath_expression[MAX_XPATH_EXPRESSION];
  xmlNodePtr node = NULL;

  // A missing id cannot be formatted into the query below.
  if (id == NULL) {
    die("Unable to get sequence length from sequence id. No sequence id was given.");
  }

  // Build the XPath query selecting the sequence element with this id.
  int char_written = snprintf(
    xpath_expression,
    MAX_XPATH_EXPRESSION,
    "/MEME/training_set/sequence[@id='%s']",
    (char *) id
  );

  // Check that formatting succeeded and that we didn't have to truncate the string.
  if (char_written < 0 || char_written >= MAX_XPATH_EXPRESSION) {
    die("Unable to get sequence length from sequence id. XPath expression exceeded maxiumum allowed size.");
  }
  sequence_obj = xpath_query(ctxt, xpath_expression);
  if (sequence_obj == NULL) {
    die("Unable to query XML file for length of sequence with id %s.", id);
  }

  // A query that matches nothing leaves the node set NULL or empty;
  // nodeTab[0] must not be read in that case.
  if (sequence_obj->nodesetval == NULL
      || sequence_obj->nodesetval->nodeNr < 1
      || sequence_obj->nodesetval->nodeTab == NULL) {
    die("Unable to find length for sequence with id %s.", id);
  }
  node = sequence_obj->nodesetval->nodeTab[0];
  if (node == NULL) {
    die("Unable to find length for sequence with id %s.", id);
  }

  // xmlGetProp returns a copy, so the query result can be released
  // before the length string is handed to the caller.
  xmlChar* sequence_length = xmlGetProp(node, BAD_CAST "length");
  xmlXPathFreeObject(sequence_obj);

  if (sequence_length == NULL) {
    die("Unable to find length for sequence with id %s.", id);
  }

  return sequence_length;
}

/***********************************************************************
 * Append text to a heap-allocated, NUL-terminated string, enlarging the
 * allocation when the text does not fit.
 *
 * On entry *buffer may be NULL with *capacity == 0. Setting *length to 0
 * before a call restarts the string while reusing the allocation.
 * Calls die() if memory cannot be obtained.
 ***********************************************************************/
static void append_occurrence_text(
  char**      buffer,    // Heap buffer holding the string. IN OUT
  size_t*     length,    // String length, excluding the NUL. IN OUT
  size_t*     capacity,  // Allocated size of the buffer in bytes. IN OUT
  const char* text       // Text to append. IN
) {
  size_t text_length = strlen(text);
  size_t needed = *length + text_length + 1;

  if (needed > *capacity) {
    size_t new_capacity = (*capacity == 0) ? 256 : *capacity;
    while (new_capacity < needed) {
      new_capacity *= 2;
    }
    // Keep the old pointer until realloc is known to have succeeded.
    char* grown = realloc(*buffer, new_capacity);
    if (grown == NULL) {
      die("Unable to allocate memory for motif occurrence string.");
    }
    *buffer = grown;
    *capacity = new_capacity;
  }

  // Copy the text together with its terminating NUL.
  memcpy(*buffer + *length, text, text_length + 1);
  *length += text_length;
}

/***********************************************************************
 * Read the motif occurrences info from MEME XML.
 *
 * One string is added to the list for each scanned_sites element.
 * Each string contains the following space-terminated items
 *  - sequence name (looked up from the sequence id),
 *  - sequence p-value,
 *  - number n of motif occurrences, and
 *  - length of sequence.
 *
 * This is followed by one triple per child element containing
 *  - motif name, prefixed by "+" or "-" when the strand attribute
 *    starts with "plus" or "minus",
 *  - occurrence position, and
 *  - occurrence p-value.
 *
 * The string is built in a growable heap buffer so that sequences with
 * many sites or long names cannot overrun a fixed-size array.
 *
 * Return value: TRUE. Failures are reported through die().
 ***********************************************************************/
static BOOLEAN_T read_motif_occurrences_from_xml(
  xmlXPathContextPtr xpath_ctxt,    // MEME XPath context.
  STRING_LIST_T** motif_occurrences  // List of motif occurrence strings. OUT
) {

  *motif_occurrences = new_string_list();

  // Get all the scanned sites.
  xmlXPathObjectPtr scanned_sites_obj =
    xpath_query(xpath_ctxt, "/MEME/scanned_sites_summary/scanned_sites");
  if (scanned_sites_obj == NULL) {
    die("Unable to query XML file for scanned sites.");
  }

  // A query that matches nothing may leave the node set NULL.
  int num_sequences =
    (scanned_sites_obj->nodesetval ? scanned_sites_obj->nodesetval->nodeNr : 0);

  // One buffer is reused for every sequence. The original code passed a
  // reused stack buffer to add_string(), so add_string() is relied upon
  // to keep its own copy.
  char*  occurrence = NULL;
  size_t occurrence_length = 0;
  size_t occurrence_capacity = 0;

  int i = 0;
  for (i = 0; i < num_sequences; i++) {

    xmlNodePtr sites_node = scanned_sites_obj->nodesetval->nodeTab[i];
    xmlChar* sequence_id = xmlGetProp(sites_node, BAD_CAST "sequence_id");
    xmlChar* pvalue = xmlGetProp(sites_node, BAD_CAST "pvalue");
    xmlChar* num_sites = xmlGetProp(sites_node, BAD_CAST "num_sites");
    if (sequence_id == NULL || pvalue == NULL || num_sites == NULL) {
      die("Scanned sites entry %d in XML file is missing a required attribute.", i + 1);
    }
    xmlChar* sequence_name = get_sequence_name_from_id(xpath_ctxt, sequence_id);
    xmlChar* sequence_length = get_sequence_length_from_id(xpath_ctxt, sequence_id);

    // Start a fresh string for this sequence.
    occurrence_length = 0;
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) sequence_name);
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) pvalue);
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) num_sites);
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) sequence_length);
    append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");
    xmlFree(sequence_id);
    xmlFree(pvalue);
    xmlFree(num_sites);
    xmlFree(sequence_name);
    xmlFree(sequence_length);

    // Every element child describes one motif occurrence; text and
    // comment nodes between the elements are skipped.
    xmlNodePtr site_node = NULL;
    for (site_node = sites_node->children; site_node != NULL; site_node = site_node->next) {

      if (site_node->type != XML_ELEMENT_NODE) {
        continue;
      }

      xmlChar* motif_id = xmlGetProp(site_node, BAD_CAST "motif_id");
      xmlChar* strand = xmlGetProp(site_node, BAD_CAST "strand");
      xmlChar* position = xmlGetProp(site_node, BAD_CAST "position");
      xmlChar* site_pvalue = xmlGetProp(site_node, BAD_CAST "pvalue");
      if (motif_id == NULL || position == NULL || site_pvalue == NULL) {
        die("A scanned site in entry %d of the XML file is missing a required attribute.", i + 1);
      }
      xmlChar* motif_name = get_motif_name_from_id(xpath_ctxt, motif_id);

      // A missing or unrecognised strand adds no prefix.
      if (strand != NULL) {
        if (strncmp("plus", (const char *) strand, 4) == 0) {
          append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, "+");
        }
        else if (strncmp("minus", (const char *) strand, 5) == 0) {
          append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, "-");
        }
      }
      append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) motif_name);
      append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");
      append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) position);
      append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");
      append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, (const char *) site_pvalue);
      append_occurrence_text(&occurrence, &occurrence_length, &occurrence_capacity, " ");

      xmlFree(motif_id);
      xmlFree(motif_name);
      if (strand != NULL) {
        xmlFree(strand);
      }
      xmlFree(position);
      xmlFree(site_pvalue);
    }

    add_string(occurrence, *motif_occurrences);

  }

  free(occurrence);
  xmlXPathFreeObject(scanned_sites_obj);

  return(TRUE);

}

/*************************************************************************
 * Setup motif-to-motif occurrence and spacer length frequency
 * transition matrices in a naive fashion, without taking into account
 * any motif occurrence information.
 *************************************************************************/
#define SPACER_LENGTH 9.0 // Expected length of a spacer.
void compute_naive_transitions_and_spacers
  (const int  nmotifs,     // The number of motifs.
   MATRIX_T** transp_freq, // Motif-to-motif occurrence matrix.
   MATRIX_T** spacer_ave)  // Average spacer length matrix.
{
  // Row/column 0 is the start state, 1..nmotifs are the motifs and
  // nmotifs+1 is the end state, hence two extra rows and columns.
  const int dimension = nmotifs + 2;
  // Uniform choice among the motifs when leaving the start state.
  const PROB_T from_start = 1.0 / (PROB_T)nmotifs;
  // Uniform choice among the motifs plus the end state when leaving a motif.
  const PROB_T from_motif = 1.0 / ((PROB_T)(nmotifs + 1));
  int source;
  int target;

  // Allocate the matrices.
  *transp_freq = allocate_matrix(dimension, dimension);
  *spacer_ave = allocate_matrix(dimension, dimension);

  // Start state: transitions to every motif, each with the default spacer.
  target = 1;
  while (target <= nmotifs) {
    set_matrix_cell(0, target, from_start, *transp_freq);
    set_matrix_cell(0, target, SPACER_LENGTH, *spacer_ave);
    target++;
  }

  // Motif rows: transitions to every motif and to the end state.
  source = 1;
  while (source <= nmotifs) {
    target = 1;
    while (target <= nmotifs + 1) {
      set_matrix_cell(source, target, from_motif, *transp_freq);
      set_matrix_cell(source, target, SPACER_LENGTH, *spacer_ave);
      target++;
    }
    source++;
  }
}

/***********************************************************************
 * Read a MEME file in XML format using the libxml2 XML parsing tools.
 ***********************************************************************/
BOOLEAN_T read_meme_xml_file(
   char*      meme_filename,        // Name of MEME file  IN
   char*      bg_filename,            // Name of background freq. file IN
   double     pseudocount,          // Pseudocount to be applied motif freqs. IN
   int*       num_motifs,           // Number of motifs retrieved  OUT
   MOTIF_T*   motifs,               // The retrieved motifs
   STRING_LIST_T** motif_occurrences, // Strings desc. motif occurrences  OUT
   BOOLEAN_T* has_reverse_strand,     // Does this file have both strands? OUT
   ARRAY_T**  background              // Background emission distribution  OUT
)
{
  // Returns FALSE, leaving the OUT parameters untouched, when libxml2
  // yields no document or marks it as not valid; returns TRUE once all
  // sections have been read.
  xmlParserCtxtPtr ctxt = NULL;         // The parser context
  xmlDocPtr meme_doc = NULL;            // The resulting document tree
  xmlXPathContextPtr xpath_ctxt = NULL; // XPath context.

  ctxt = xmlNewParserCtxt();
  if (ctxt == NULL) {
    die("Failed to create XML parser.\n");
  }

  // Parse and validate the file. Parser errors and warnings are
  // suppressed, so a file that is not XML is rejected silently.
  meme_doc = xmlCtxtReadFile(
    ctxt,
    meme_filename,
    NULL,  // Encoding
    XML_PARSE_RECOVER | XML_PARSE_DTDVALID | XML_PARSE_NOERROR | XML_PARSE_NOWARNING
  );

  // Did it parse?
  // NOTE(review): libxml2 documents xmlCleanupParser() as a call to make
  // only when the whole program is finished with the library; confirm
  // that calling it here (and before returning TRUE below) is intended.
  if (meme_doc == NULL || ctxt->valid == 0) {
   xmlFreeDoc(meme_doc);
   xmlFreeParserCtxt(ctxt);
   xmlCleanupParser();
   return FALSE;
  }
  if (verbosity > NORMAL_VERBOSE) {
    fprintf(stderr, "File %s is a valid MEME XML file.\n", meme_filename);
  }

  // Set up XPath context from parsed XML. Every reader below uses the
  // context, so a failed allocation must not be passed on.
  xpath_ctxt = xmlXPathNewContext(meme_doc);
  if (xpath_ctxt == NULL) {
    die("Failed to create XPath context for MEME XML file %s.\n", meme_filename);
  }

  // Read the alphabet.
  read_alphabet_from_xml(xpath_ctxt);

  // Read the strandedness. Only consulted for the DNA alphabet.
  if (which_alphabet() == DNA_ALPH) {
    *has_reverse_strand = read_strand_from_xml(xpath_ctxt);
  } else {
    *has_reverse_strand = FALSE;
  }

  // Establish the background frequencies.
  if (bg_filename == NULL) {
    // Default is to use pre-calculated bg freq. from NR database.
    if (verbosity >= HIGH_VERBOSE) {
      fprintf(
        stderr,
        "Using background frequencies from NR sequence database.\n"
      );
    }
    *background = allocate_array(get_alph_size(ALL_SIZE));
    get_nrdb_frequencies(*background);
  }
  else {
    if (strcmp(bg_filename, "motif-file") == 0) {
      // If bg_filename matches "motif-file" read bg freq. from motif file.
      if (verbosity >= NORMAL_VERBOSE) {
        fprintf(
          stderr,
          "Using background frequencies from file %s.\n",
          meme_filename
        );
      }
      *background = read_bg_freqs_from_xml(xpath_ctxt);
    }
    else {
      // Otherwise read bg freqs. from external bg file.
      if (verbosity >= NORMAL_VERBOSE) {
        fprintf(
          stderr,
          "Using background frequencies from file %s.\n",
          bg_filename
        );
      }
      *background = read_background_file(bg_filename);
    }
  }

  // Read the motifs
  read_motifs_from_xml(
    xpath_ctxt,
    *background,
    pseudocount,
    num_motifs,
    motifs,
    motif_occurrences
  );

  // Read the motif occurrences
  read_motif_occurrences_from_xml(xpath_ctxt, motif_occurrences);

  /* free up the resulting document */
  xmlXPathFreeContext(xpath_ctxt);
  xmlFreeDoc(meme_doc);
  xmlFreeParserCtxt(ctxt);
  xmlCleanupParser();

  return TRUE;

}

/***********************************************************************
 * Read a MEME file.
 ***********************************************************************/
void read_meme_file(
   char*      meme_filename,          // Name of MEME file  IN
   char*      bg_filename,            // Name of background freq. file IN
   double     pseudocount,            // Value of pseudocount IN
   int*       num_motifs,             // Number of motifs retrieved  OUT
   MOTIF_T*   motifs,                 // The retrieved motifs
   STRING_LIST_T** motif_occurrences, // Strings desc. motif occurrences  OUT
   BOOLEAN_T* has_reverse_strand,     // Does this file have both strands? OUT
   ARRAY_T**  background              // Background emission distribution  OUT
)
{
  // Attempt the XML format first; if that reader accepts the file,
  // all outputs are already filled in and there is nothing left to do.
  if (read_meme_xml_file(
        meme_filename,
        bg_filename,
        pseudocount,
        num_motifs,
        motifs,
        motif_occurrences,
        has_reverse_strand,
        background)) {
    return;
  }

  // Fall back to the text format: open the given MEME file.
  FILE* motif_file;  // MEME file containing the motifs.
  if (open_file(meme_filename, "r", TRUE, "motif", "motifs", &motif_file) == 0) {
    exit(1);
  }

  // Check to be sure we can read this version. The returned string is
  // not used any further in this function.
  char* file_version = read_version(motif_file);

  // Read the alphabet.
  read_alphabet(motif_file);

  // Strandedness is only read from the file for the DNA alphabet.
  *has_reverse_strand =
    (which_alphabet() == DNA_ALPH) ? read_strand(motif_file) : FALSE;

  // Establish the background frequencies from one of three sources.
  const int report_source = (verbosity >= NORMAL_VERBOSE);
  if (bg_filename == NULL) {
    // No file named: use the pre-calculated NR database frequencies.
    if (report_source) {
      fprintf(
        stderr,
        "Using background frequencies from NR sequence database.\n"
      );
    }
    *background = allocate_array(get_alph_size(ALL_SIZE));
    get_nrdb_frequencies(*background);
  }
  else if (strcmp(bg_filename, "motif-file") == 0) {
    // The special name "motif-file" selects the MEME file itself.
    if (report_source) {
      fprintf(
        stderr,
        "Using background frequencies from file %s.\n",
        meme_filename
      );
    }
    *background = read_freqs(motif_file);
  }
  else {
    // Any other name is an external background file.
    if (report_source) {
      fprintf(
        stderr,
        "Using background frequencies from file %s.\n",
        bg_filename
      );
    }
    *background = read_background_file(bg_filename);
  }

  // Read the motifs, then the motif occurrence section that follows them.
  read_motifs(motif_file, *background, pseudocount, num_motifs, motifs);
  read_motif_occurrences(motif_file, motif_occurrences);

  /* Close the MEME file. */
  fclose(motif_file);
}

/***********************************************************************
 * Print out a given motif.
 ***********************************************************************/
static void write_motif(
  MOTIF_T* a_motif,     /* A data structure containing the motif. */
  FILE*    outfile      /* An ASCII version of the motif in MEME  */
) {                     /* output format.                         */
  int position;  /* Row of the frequency matrix (motif position). */
  int letter;    /* Column of the frequency matrix (alphabet index). */

  /* Motif id line followed by the summary line. */
  fprintf(outfile, "MOTIF %s\n\n", a_motif->id);
  fprintf(outfile, "BL   MOTIF %s width=%d seqs=%6.3f\n", a_motif->id,
    a_motif->length, a_motif->num_sites);

  /* Matrix header, written piece by piece on a single output line. */
  fprintf(outfile, "letter-probability matrix: alength= %d ",
    a_motif->alph_size);
  fprintf(outfile, "w= %d ", a_motif->length);
  fprintf(outfile, "nsites= %6.3f ", a_motif->num_sites);
  fprintf(outfile, "E= %g ", a_motif->evalue);
  fprintf(outfile, "complexity= %6.3f\n", a_motif->complexity);

  /* One output line per motif position, one value per alphabet letter. */
  position = 0;
  while (position < a_motif->length) {
    letter = 0;
    while (letter < a_motif->alph_size) {
      fprintf(outfile, "%9.6f ",
        get_matrix_cell(position, letter, a_motif->freqs));
      letter++;
    }
    fprintf(outfile, "\n");
    position++;
  }

  /* Blank lines separate consecutive motifs. */
  fprintf(outfile, "\n\n");
}

#ifdef MAIN
#include "simple-getopt.h"

// Global verbosity level tested by the file readers above before they
// print progress messages. This definition is compiled only when MAIN is
// defined; main() sets it to NORMAL_VERBOSE before parsing the options.
VERBOSE_T verbosity = INVALID_VERBOSE;

/*************************************************************************
 * int main
 *************************************************************************/
int main(int argc, char *argv[])
{
  // Stand-alone driver: reads the motifs from one MEME file and prints
  // either their ids (--indices) or the motifs with the background.

  // Data structures.
  int       num_motifs;         // The number of motifs in the model.
  MOTIF_T   motifs[2 * MAX_MOTIFS]; // The motifs.
  ARRAY_T*  background;         // Background probs for alphabet.
  ORDER_T*  order_spacing;      // Linear HMM order and spacing.

  // Command line parameters.
  char *    meme_filename;      // Input file containg motifs.
  BOOLEAN_T ids_only;           // Print only the motif ids?
  BOOLEAN_T reorder;            // Reorder the motifs?
  BOOLEAN_T has_reverse_strand; // MEME file contains both strands
  STRING_LIST_T* motif_occurrences; // Strings describing motif occurences.
  STRING_LIST_T* requested_motifs; // Indices of requested motifs.
  int       request_n;          // The user asked for the first n motifs.
  double    e_threshold;        // E-value threshold for motif inclusion.
  double    complexity_threshold; // For eliminating low complexity motifs.
  double    p_threshold;        // p-value threshold for motif occurences.
  BOOLEAN_T keep_unused;        // Keep unused transitions?
  char*     order_string;       // Motif order and spacing. *

  // Local variables.
  int       i_motif;
  int       alph_size;
  int       i_alph;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options.
  // NOTE(review): "transpseudo" is accepted here but is neither handled
  // in the parsing loop nor listed in the usage message.
  cmdoption const options[] = {
    {"indices", NO_VALUE},
    {"reorder", NO_VALUE},
    {"motif", REQUIRED_VALUE},
    {"nmotifs", REQUIRED_VALUE},
    {"ethresh", REQUIRED_VALUE},
    {"lowcomp", REQUIRED_VALUE},
    {"pthresh", REQUIRED_VALUE},
    {"keep-unused", NO_VALUE},
    {"order", REQUIRED_VALUE},
    {"transpseudo", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };
  int option_count = 11;
  int option_index = 0;

  // Define the usage message. A constant literal avoids both the
  // fixed-size buffer and the run-time concatenation.
  static const char usage[] =
    "USAGE: meme-io [options] <motifs>\n"
    "\n"
    "   Options:\n"
    "     --indices\n"
    "     --reorder\n"
    "     --motif <motif #> (may be repeated)\n"
    "     --nmotifs <#>\n"
    "     --ethresh <E-value>\n"
    "     --lowcomp <value>\n"
    "     --pthresh <p-value>\n"
    "     --keep-unused\n"
    "     --order <string>\n"
    "     --verbosity 1|2|3|4|5 (default=2)\n"
    "\n";

  // Initialize.
  order_spacing = NULL;

  // Make sure various options are set to NULL or defaults.
  meme_filename = NULL;
  ids_only = FALSE;
  reorder = FALSE;
  request_n = 0;
  requested_motifs = new_string_list();
  e_threshold = 0.0;
  complexity_threshold = 0.0;
  p_threshold = 0.0;
  keep_unused = FALSE;
  order_string = NULL;
  verbosity = NORMAL_VERBOSE;

  simple_setopt(argc, argv, option_count, options);

  // Parse the command line.
  while (1) {

    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char* message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }

    if (strcmp(option_name, "indices") == 0) {
      ids_only = TRUE;
    } else if (strcmp(option_name, "reorder") == 0) {
      reorder = TRUE;
    } else if (strcmp(option_name, "motif") == 0) {
      add_string(option_value, requested_motifs);
    } else if (strcmp(option_name, "nmotifs") == 0) {
      request_n = atoi(option_value);
    } else if (strcmp(option_name, "ethresh") == 0) {
      e_threshold = atof(option_value);
    } else if (strcmp(option_name, "lowcomp") == 0) {
      complexity_threshold = atof(option_value);
    } else if (strcmp(option_name, "pthresh") == 0) {
      p_threshold = atof(option_value);
    } else if (strcmp(option_name, "keep-unused") == 0) {
      keep_unused = TRUE;
    } else if (strcmp(option_name, "order") == 0) {
      order_string = option_value;
    } else if (strcmp(option_name, "verbosity") == 0) {
      verbosity = (VERBOSE_T)atoi(option_value);
    }
  }

  // Read the single required argument.
  if (option_index + 1 != argc) {
    // Written with fputs so the text is never interpreted as a format.
    fputs(usage, stderr);
    exit(1);
  }
  meme_filename = argv[option_index];

  // Set up motif requests.
  if (request_n != 0) {
    if (get_num_strings(requested_motifs) != 0) {
      die("Can't combine the -motif and -nmotifs options.\n");
    } else {
      // Request motifs "1" .. "n" by their numeric ids.
      for (i_motif = 0; i_motif < request_n; i_motif++) {
        char motif_id[MAX_MOTIF_ID_LENGTH + 1];
        snprintf(motif_id, sizeof motif_id, "%d", i_motif + 1);
        add_string(motif_id, requested_motifs);
      }
    }
  }

  // Make sure motifs weren't selected redundantly.
  if ((get_num_strings(requested_motifs) != 0) && (e_threshold != 0.0)) {
    die("Can't use -motif or -nmotifs with -ethresh.");
  }
  if ((get_num_strings(requested_motifs) != 0) && (order_string != NULL)) {
    die("Can't use -motif or -nmotifs with -order.");
  }
  if ((order_string != NULL) && (e_threshold != 0.0)) {
    die("Can't use -ethresh and -order.");
  }

  // Parse the order string.
  order_spacing = create_order(order_string);

  /**********************************************
   * READING THE MOTIFS
   **********************************************/

  // Read all the motifs from the MEME file.
  read_meme_file(
     meme_filename,
     "motif-file", // Name of background freq. file
     0.0, // pseudocount
     &num_motifs,
     motifs,
     &motif_occurrences,
     &has_reverse_strand,
     &background
  );

  /**********************************************
   * WRITING THE MOTIFS
   **********************************************/
  if (ids_only) {
    for (i_motif = 0; i_motif < num_motifs; i_motif++) {
      printf("%s ", get_motif_id(&(motifs[i_motif])));
    }
    printf("\n");
  } else {

    printf("Meta-MEME version %s\n", VERSION);
    printf("ALPHABET= %s\n\n", get_alphabet(FALSE));
    printf("strands: +\n\n");

    // Print the background.
    printf("%s\n", LETTER_FREQUENCY);
    {
      int alph_size;
      int i;

      alph_size = get_alph_size(ALPH_SIZE);
      for (i = 0; i < alph_size; i++) {
        printf("%c %5.3f ", get_alph_char(i), get_array_item(i, background));
      }
      printf("\n\n");
    }

    // Print the requested motifs.
    for (i_motif = 0; i_motif < num_motifs; i_motif++) {
      write_motif(&(motifs[i_motif]), stdout);
    }

    // Print the letter frequencies.
    printf("Letter frequencies:\n");
    alph_size = get_alph_size(ALPH_SIZE);
    for (i_alph = 0; i_alph < alph_size; i_alph++) {
      printf("%c %5.3f ", get_alph_char(i_alph),
      get_array_item(i_alph, background));
    }
    printf("\n");
  }

  // Clean up and exit.
  free_string_list(requested_motifs);
  free_order(order_spacing);
  for (i_motif = 0; i_motif < num_motifs; i_motif++) {
    free_motif(&(motifs[i_motif]));
  }
  free_array(background);
  return(0);
}
#endif


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 2
 * End:
 */

