src/util/creaders/alnread.c

Go to the documentation of this file.
00001 /*
00002  * $Id: alnread.c 171976 2009-09-30 16:00:49Z bollin $
00003  *
00004  * ===========================================================================
00005  *
00006  *                            PUBLIC DOMAIN NOTICE
00007  *               National Center for Biotechnology Information
00008  *
00009  *  This software/database is a "United States Government Work" under the
00010  *  terms of the United States Copyright Act.  It was written as part of
00011  *  the author's official duties as a United States Government employee and
00012  *  thus cannot be copyrighted.  This software/database is freely available
00013  *  to the public for use. The National Library of Medicine and the U.S.
00014  *  Government have not placed any restriction on its use or reproduction.
00015  *
00016  *  Although all reasonable efforts have been taken to ensure the accuracy
00017  *  and reliability of the software and data, the NLM and the U.S.
00018  *  Government do not and cannot warrant the performance or results that
00019  *  may be obtained by using this software or data. The NLM and the U.S.
00020  *  Government disclaim all warranties, express or implied, including
00021  *  warranties of performance, merchantability or fitness for any particular
00022  *  purpose.
00023  *
00024  *  Please cite the author in any work or product based on this material.
00025  *
00026  * ===========================================================================
00027  *
00028  * Authors:  Colleen Bollin
00029  *
00030  */
00031 
00032 #include <util/creaders/alnread.h>
00033 #include <stdio.h>
00034 #include <stdlib.h>
00035 #include <string.h>
00036 #include <ctype.h>
00037 
00038 static const int kMaxPrintedIntLen = 10;
00039 #define MAX_PRINTED_INT_LEN_PLUS_ONE 11
00040 
00041 typedef enum {
00042     eTrue = -1,
00043     eFalse = 0
00044 } EBool;
00045 
00046 /* structures used internally */
00047 typedef struct SLineInfo {
00048     char *             data;
00049     int                line_num;
00050     int                line_offset;
00051     EBool              delete_me;
00052     struct SLineInfo * next;
00053 } SLineInfo, * TLineInfoPtr;
00054 
00055 typedef struct SLineInfoReader {
00056     TLineInfoPtr first_line;
00057     TLineInfoPtr curr_line;
00058     char *       curr_line_pos;
00059     int          data_pos;
00060 } SLineInfoReader, * TLineInfoReaderPtr;
00061 
00062 typedef struct SIntLink {
00063     int               ival;
00064     struct SIntLink * next;
00065 } SIntLink, * TIntLinkPtr;
00066 
00067 typedef struct SStringCount {
00068     char *                string;
00069     int                   num_appearances;
00070     TIntLinkPtr           line_numbers;
00071     struct SStringCount * next;
00072 } SStringCount, * TStringCountPtr;
00073 
00074 typedef struct SSizeInfo {
00075     int                size_value;
00076     int                num_appearances;
00077     struct SSizeInfo * next;
00078 } SSizeInfo, * TSizeInfoPtr;
00079 
00080 typedef struct SLengthList {
00081     TSizeInfoPtr         lengthrepeats;
00082     int                  num_appearances;
00083     struct SLengthList * next;
00084 } SLengthListData, * SLengthListPtr;
00085  
00086 typedef struct SCommentLoc {
00087   char *               start;
00088   char *               end;
00089   struct SCommentLoc * next;
00090 } SCommentLoc, * TCommentLocPtr;
00091 
00092 typedef struct SBracketedCommentList 
00093 {
00094     TLineInfoPtr                   comment_lines;
00095     struct SBracketedCommentList * next;
00096 } SBracketedCommentList, * TBracketedCommentListPtr;
00097 
00098 typedef struct SAlignRawSeq {
00099     char *                id;
00100     TLineInfoPtr          sequence_data;
00101     TIntLinkPtr           id_lines;
00102     struct SAlignRawSeq * next;
00103 } SAlignRawSeq, * TAlignRawSeqPtr;
00104 
00105 typedef struct SAlignFileRaw {
00106     TLineInfoPtr         line_list;
00107     TLineInfoPtr         organisms;
00108     TAlignRawSeqPtr      sequences;
00109     int                  num_organisms;
00110     TLineInfoPtr         deflines;
00111     int                  num_deflines;
00112     EBool                marked_ids;
00113     int                  block_size;
00114     TIntLinkPtr          offset_list;
00115     FReportErrorFunction report_error;
00116     void *               report_error_userdata;
00117     char *               alphabet;
00118     int                  expected_num_sequence;
00119     int                  expected_sequence_len;
00120     int                  num_segments;
00121     char                 align_format_found;
00122 } SAlignRawFileData, * SAlignRawFilePtr;
00123 
00124 /* These functions are used for storing and transmitting information
00125  * about errors encountered while reading the alignment data.
00126  */
00127 
00128 /* This function allocates memory for a new error structure and populates
00129  * the structure with default values.
00130  * The new structure will be added to the end of the linked list of error
00131  * structures pointed to by list.
00132  */
00133 extern TErrorInfoPtr ErrorInfoNew (TErrorInfoPtr list)
00134 {
00135     TErrorInfoPtr eip, last;
00136 
00137     eip = (TErrorInfoPtr) malloc ( sizeof (SErrorInfo));
00138     if (eip == NULL) {
00139         return NULL;
00140     }
00141     eip->category = eAlnErr_Unknown;
00142     eip->line_num = -1;
00143     eip->id       = NULL;
00144     eip->message  = NULL;
00145     eip->next     = NULL;
00146     last = list;
00147     while (last != NULL && last->next != NULL) {
00148         last = last->next;
00149     }
00150     if (last != NULL) {
00151         last->next = eip;
00152     }
00153     return eip;
00154 }
00155 
00156 
00157 /* This function recursively frees the memory associated with a list of
00158  * error structures as well as the member variables of the error structures.
00159  */
00160 extern void ErrorInfoFree (TErrorInfoPtr eip)
00161 {
00162     if (eip == NULL) {
00163         return;
00164     }
00165     ErrorInfoFree (eip->next);
00166     free (eip->id);
00167     free (eip->message);
00168     free (eip);
00169 }
00170 
00171 /* This function creates and sends an error message regarding a NEXUS comment
00172  * character.
00173  */
00174 static void 
00175 s_ReportCharCommentError 
00176 (char * expected,
00177  char    seen,
00178  char * val_name,
00179  FReportErrorFunction errfunc,
00180  void *             errdata)
00181 {
00182     TErrorInfoPtr eip;
00183     const char * errformat = "Specified %s character does not match NEXUS"
00184                              " comment in file (specified %s, comment %c)";
00185 
00186     if (errfunc == NULL  ||  val_name == NULL || expected == NULL) {
00187         return;
00188     }
00189 
00190     eip = ErrorInfoNew (NULL);
00191     if (eip != NULL) {
00192         eip->category = eAlnErr_BadFormat;
00193         eip->message = (char *) malloc (strlen (errformat) + strlen (val_name)
00194                                         + strlen (expected) + 2);
00195         if (eip->message != NULL) {
00196             sprintf (eip->message, errformat, val_name, expected, seen);
00197         }
00198         errfunc (eip, errdata);
00199     }
00200 }
00201 
00202 
00203 /* This function creates and sends an error message regarding a character
00204  * that is unexpected in sequence data.
00205  */
00206 static void 
00207 s_ReportBadCharError 
00208 (char *  id,
00209  char    bad_char,
00210  int     num_bad,
00211  int     offset,
00212  int     line_number,
00213  char *  reason,
00214  FReportErrorFunction errfunc,
00215  void *             errdata)
00216 {
00217     TErrorInfoPtr eip;
00218     const char *  err_format =
00219                           "%d bad characters (%c) found at position %d (%s).";
00220 
00221     if (errfunc == NULL  ||  num_bad == 0  ||  bad_char == 0
00222         ||  reason == NULL) {
00223         return;
00224     }
00225 
00226     eip = ErrorInfoNew (NULL);
00227     if (eip == NULL) {
00228         return;
00229     }
00230 
00231     eip->category = eAlnErr_BadData;
00232     if (id != NULL) eip->id = strdup (id);
00233     eip->line_num = line_number;
00234     eip->message = (char *) malloc (strlen (err_format) + 2 * kMaxPrintedIntLen
00235                                     + strlen (reason) + 3);
00236     if (eip->message != NULL)
00237     {
00238         sprintf (eip->message, err_format, num_bad, bad_char, offset, reason);
00239     }
00240     errfunc (eip, errdata);
00241 }
00242  
00243 
00244 /* This function creates and sends an error message regarding an ID that
00245  * was found in the wrong location.
00246  */
00247 static void 
00248 s_ReportInconsistentID 
00249 (char *               id,
00250  int                  line_number,
00251  FReportErrorFunction report_error,
00252  void *              report_error_userdata)
00253 {
00254     TErrorInfoPtr eip;
00255 
00256     if (report_error == NULL) {
00257         return;
00258     }
00259     eip = ErrorInfoNew (NULL);
00260     if (eip == NULL) {
00261         return;
00262     }
00263     eip->category = eAlnErr_BadFormat;
00264     eip->id = strdup (id);
00265     eip->line_num = line_number;
00266     eip->message = strdup ("Found unexpected ID");
00267     report_error (eip, report_error_userdata);
00268 }
00269 
00270 
00271 /* This function creates and sends an error message regarding a line
00272  * of sequence data that was expected to have a different length.
00273  */
00274 static void 
00275 s_ReportInconsistentBlockLine 
00276 (char *               id,
00277  int                  line_number,
00278  FReportErrorFunction report_error,
00279  void *              report_error_userdata)
00280 {
00281     TErrorInfoPtr eip;
00282 
00283     if (report_error == NULL) {
00284         return;
00285     }
00286     eip = ErrorInfoNew (NULL);
00287     if (eip == NULL) {
00288         return;
00289     }
00290     eip->category = eAlnErr_BadFormat;
00291     eip->id = strdup (id);
00292     eip->line_num = line_number;
00293     eip->message = strdup ("Inconsistent block line formatting");
00294     report_error (eip, report_error_userdata);
00295 }
00296 
00297 
00298 #if 0
00299 /* this section was removed by indexer request */
00300 /* This function creates and sends an error message regarding mismatched
00301  * definition lines
00302  */
00303 static void
00304 s_ReportDefinitionLineMismatch
00305 (FReportErrorFunction report_error,
00306  void *              report_error_userdata)
00307 {
00308     TErrorInfoPtr eip;
00309 
00310     if (report_error == NULL) {
00311         return;
00312     }
00313     eip = ErrorInfoNew (NULL);
00314     if (eip == NULL) {
00315         return;
00316     }
00317 
00318     eip->category = eAlnErr_BadData;
00319     eip->message = strdup ("Mismatched definition lines");
00320     report_error (eip, report_error_userdata);
00321 }
00322 #endif
00323 
00324 
00325 /* This function recursively creates and sends an error message 
00326  * regarding the number of times items in list appear.
00327  */
00328 static void 
00329 s_ReportDefinitionLines 
00330 (TStringCountPtr      list,
00331  FReportErrorFunction report_error,
00332  void *              report_error_userdata)
00333 {
00334     TErrorInfoPtr eip;
00335     const char *  err_null_format = "Null definition line occurs %d times";
00336     const char *  err_format = "Definition line %s occurs %d times";
00337 
00338     if (list == NULL  ||  report_error == NULL) {
00339         return;
00340     }
00341     eip = ErrorInfoNew (NULL);
00342     if (eip == NULL) {
00343         return;
00344     }
00345 
00346     eip->category = eAlnErr_BadData;
00347     if (list->string == NULL) {
00348         eip->message = (char*)malloc (strlen (err_null_format)
00349                                       + kMaxPrintedIntLen + 1);
00350         if (eip->message != NULL) {
00351             sprintf (eip->message, err_null_format, list->num_appearances);
00352         }
00353     } else {
00354         eip->message = (char*)malloc (strlen (err_format)
00355                                       + strlen (list->string)
00356                                       + kMaxPrintedIntLen + 1);
00357         if (eip->message != NULL) {
00358             sprintf (eip->message, err_format, list->string,
00359                      list->num_appearances);
00360         }
00361     }
00362     report_error (eip, report_error_userdata);
00363   
00364     s_ReportDefinitionLines (list->next, report_error, report_error_userdata);
00365 }
00366 
00367   
00368 /* This function creates and sends an error message regarding a line of
00369  * sequence data that was expected to be a different length.
00370  */
00371 static void 
00372 s_ReportLineLengthError 
00373 (char *               id,
00374  TLineInfoPtr         lip,
00375  int                  expected_length,
00376  FReportErrorFunction report_error,
00377  void *               report_error_userdata)
00378 {
00379     TErrorInfoPtr eip;
00380     char *        msg;
00381     const char *  format = "Expected line length %d, actual length %d";
00382     int           len;
00383 
00384     if (lip == NULL  ||  report_error == NULL) {
00385         return;
00386     }
00387 
00388     eip = ErrorInfoNew (NULL);
00389     if (eip == NULL) {
00390         return;
00391     }
00392     eip->category = eAlnErr_BadFormat;
00393     eip->id = strdup (id);
00394     eip->line_num = lip->line_num;
00395     msg = (char *) malloc (strlen (format) + kMaxPrintedIntLen + 1);
00396     if (msg != NULL) {
00397         if (lip->data == NULL) {
00398             len = 0;
00399         } else {
00400             len = strlen (lip->data);
00401         }
00402         sprintf (msg, format, expected_length, len);
00403         eip->message = msg;
00404     }
00405     report_error (eip, report_error_userdata);
00406 }
00407 
00408 
00409 /* This function creates and sends an error message regarding a block of
00410  * sequence data that was expected to contain more lines.
00411  */
00412 static void 
00413 s_ReportBlockLengthError 
00414 (char *               id,
00415  int                  line_num,
00416  int                  expected_num,
00417  int                  actual_num,
00418  FReportErrorFunction report_error,
00419  void *              report_error_userdata)
00420 {
00421     TErrorInfoPtr eip;
00422     const char *  err_format = "Expected %d lines in block, found %d";
00423 
00424     if (report_error == NULL) {
00425         return;
00426     }
00427 
00428     eip = ErrorInfoNew (NULL);
00429     if (eip == NULL) {
00430         return;
00431     }
00432     eip->category = eAlnErr_BadFormat;
00433     eip->id = strdup (id);
00434     eip->line_num = line_num;
00435     eip->message = (char *)malloc (strlen (err_format) + 2 * kMaxPrintedIntLen + 1);
00436     if (eip->message != NULL) {
00437       sprintf (eip->message, err_format, expected_num, actual_num);
00438     }
00439     report_error (eip, report_error_userdata);
00440 }
00441 
00442 
00443 /* This function creates and sends an error message regarding a block of
00444  * sequence data that contains duplicate IDs.
00445  */
00446 static void 
00447 s_ReportDuplicateIDError 
00448 (char *               id,
00449  int                  line_num,
00450  FReportErrorFunction report_error,
00451  void *              report_error_userdata)
00452 {
00453     TErrorInfoPtr eip;
00454     const char *  err_format = "Duplicate ID!  Sequences will be concatenated!";
00455 
00456     if (report_error == NULL) {
00457         return;
00458     }
00459 
00460     eip = ErrorInfoNew (NULL);
00461     if (eip == NULL) {
00462         return;
00463     }
00464     eip->category = eAlnErr_BadData;
00465     eip->id = strdup (id);
00466     eip->line_num = line_num;
00467     eip->message = strdup (err_format);
00468     report_error (eip, report_error_userdata);
00469 }
00470 
00471 
00472 /* This function creates and sends an error message regarding missing
00473  * sequence data.
00474  */
00475 static void
00476 s_ReportMissingSequenceData
00477 (char *               id,
00478  FReportErrorFunction report_error,
00479  void *              report_error_userdata)
00480 {
00481     TErrorInfoPtr eip;
00482 
00483     if (report_error == NULL) {
00484         return;
00485     }
00486     eip = ErrorInfoNew (NULL);
00487     if (eip == NULL) {
00488         return;
00489     }
00490     eip->category = eAlnErr_Fatal;
00491     eip->id = strdup (id);
00492     eip->message = strdup ("No data found");
00493     report_error (eip, report_error_userdata);
00494 }
00495 
00496 
00497 /* This function creates and sends an error message indicating that the
00498  * most common length of the sequences in the file do not match a comment
00499  * found in the file.
00500  */
00501 static void 
00502 s_ReportBadSequenceLength 
00503 (char *               id,
00504  int                  expected_length,
00505  int                  actual_length,
00506  FReportErrorFunction report_error,
00507  void *               report_error_userdata)
00508 {
00509     TErrorInfoPtr eip;
00510     const char *  format_str = "Expected sequence length %d, actual length %d";
00511 
00512     if (report_error == NULL) {
00513         return;
00514     }
00515     eip = ErrorInfoNew (NULL);
00516     if (eip == NULL) {
00517         return;
00518     }
00519     eip->category = eAlnErr_BadFormat;
00520     eip->id = strdup (id);
00521     eip->message = (char *)malloc (strlen (format_str) + 50);
00522     if (eip->message != NULL) {
00523         sprintf (eip->message, format_str, expected_length, actual_length);
00524     }
00525     report_error (eip, report_error_userdata);
00526 }
00527 
00528 
00529 /* This function creates and sends an error message indicating that the
00530  * number of sequences read does not match a comment in the alignment file.
00531  */
00532 static void
00533 s_ReportIncorrectNumberOfSequences
00534 (int                  num_expected,
00535  int                  num_found,
00536  FReportErrorFunction report_error,
00537  void *              report_error_userdata)
00538 {
00539     TErrorInfoPtr eip;
00540     const char *  err_format = "Expected %d sequences, found %d";
00541  
00542     if (report_error == NULL) {
00543         return;
00544     }
00545     eip = ErrorInfoNew (NULL);
00546     if (eip == NULL) {
00547         return;
00548     }
00549     eip->category = eAlnErr_BadFormat;
00550     eip->message = (char *) malloc (strlen (err_format) +
00551                                     2 * kMaxPrintedIntLen + 1);
00552                                      
00553     if (eip->message != NULL)
00554     {
00555         sprintf (eip->message, err_format, num_expected, num_found);
00556     }
00557     report_error (eip, report_error_userdata);
00558 }
00559 
00560 
00561 static void
00562 s_ReportIncorrectSequenceLength 
00563 (int                 len_expected,
00564  int                 len_found,
00565  FReportErrorFunction report_error,
00566  void *             report_error_userdata)
00567 {
00568     TErrorInfoPtr eip;
00569     const char *  err_format = "Expected sequences of length %d, found %d";
00570 
00571     if (report_error == NULL) {
00572         return;
00573     }
00574     eip = ErrorInfoNew (NULL);
00575     if (eip == NULL) {
00576         return;
00577     }
00578 
00579     eip->category = eAlnErr_BadFormat;
00580     eip->message = (char *)malloc (strlen (err_format)
00581                                    + 2 * kMaxPrintedIntLen + 1);
00582     if (eip->message != NULL)
00583     {
00584       sprintf (eip->message, err_format, len_expected, len_found);
00585     }
00586     report_error (eip, report_error_userdata);
00587 }
00588 
00589 
00590 /* This function creates and sends an error message regarding a non-unique
00591  * organism name.
00592  */
00593 static void
00594 s_ReportRepeatedOrganismName
00595 (char *               id,
00596  int                  line_num,
00597  int                  second_line_num,
00598  char *               org_name,
00599  FReportErrorFunction report_error,
00600  void *              report_error_userdata)
00601 {
00602     TErrorInfoPtr eip;
00603     const char *  err_format = "Organism name %s also appears at line %d";
00604 
00605     if (report_error == NULL || org_name == NULL) {
00606         return;
00607     }
00608     eip = ErrorInfoNew (NULL);
00609     if (eip == NULL) {
00610         return;
00611     }
00612     eip->category = eAlnErr_BadData;
00613     eip->line_num = line_num;
00614     if (id != NULL ) {
00615         eip->id = strdup (id);
00616     }
00617     eip->message = (char *)malloc (strlen (err_format) + strlen (org_name)
00618                            + kMaxPrintedIntLen + 1);
00619     if (eip->message != NULL) {
00620         sprintf (eip->message, err_format, org_name, second_line_num);
00621     }
00622     report_error (eip, report_error_userdata);
00623 }
00624 
00625 
00626 /* This function creates and sends an error message indicating that some or
00627  * all of the organism information for the sequences are missing.
00628  */
00629 static void
00630 s_ReportMissingOrganismInfo
00631 (FReportErrorFunction report_error,
00632  void *             report_error_userdata)
00633 {
00634     TErrorInfoPtr eip;
00635 
00636     if (report_error == NULL) {
00637         return;
00638     }
00639     eip = ErrorInfoNew (NULL);
00640     if (eip == NULL) {
00641         return;
00642     }
00643 
00644     eip->category = eAlnErr_BadData;
00645     eip->message = strdup ("Missing organism information");
00646     report_error (eip, report_error_userdata);
00647 }
00648 
00649 
00650 /* This function creates and sends an error message regarding an ID that is
00651  * used for more than one sequence.
00652  */
00653 static void 
00654 s_ReportRepeatedId 
00655 (TStringCountPtr      scp,
00656  FReportErrorFunction report_error,
00657  void *              report_error_userdata)
00658 {
00659     TErrorInfoPtr  eip;
00660     const char *   err_format = "ID %s appears in the following locations:";
00661     char *         cp;
00662     TIntLinkPtr    line_number;
00663 
00664     if (report_error == NULL  ||  scp == NULL  ||  scp->string == NULL) {
00665         return;
00666     }
00667 
00668     eip = ErrorInfoNew (NULL);
00669     if (eip == NULL) {
00670         return;
00671     }
00672 
00673     eip->category = eAlnErr_BadData;
00674     eip->id = strdup (scp->string);
00675     if (scp->line_numbers != NULL) {
00676         eip->line_num = scp->line_numbers->ival;
00677     }
00678     eip->message = (char *) malloc ( strlen (err_format)
00679                                     + strlen (scp->string)
00680                                     + scp->num_appearances * 15
00681                                     + 1);
00682     if (eip->message != NULL) {
00683         sprintf (eip->message, err_format, scp->string);
00684         cp = eip->message + strlen (eip->message);
00685         for (line_number = scp->line_numbers;
00686              line_number != NULL;
00687              line_number = line_number->next) {
00688             sprintf (cp, " %d", line_number->ival);
00689             cp += strlen (cp);
00690         }
00691     }
00692     report_error (eip, report_error_userdata);
00693 }
00694 
00695 
00696 /* This function creates and sends an error message indicating that the file
00697  * being read is an ASN.1 file.
00698  */
00699 static void 
00700 s_ReportASN1Error 
00701 (FReportErrorFunction errfunc,
00702  void *             errdata)
00703 {
00704     TErrorInfoPtr eip;
00705     const char * msg = "This is an ASN.1 file, "
00706         "which cannot be read by this function.";
00707 
00708     if (errfunc == NULL) {
00709         return;
00710     }
00711 
00712     eip = ErrorInfoNew (NULL);
00713     if (eip != NULL) {
00714         eip->category = eAlnErr_BadData;
00715         eip->message = (char *) malloc (strlen (msg) + 1);
00716         if (eip->message != NULL) {
00717             sprintf (eip->message, "%s", msg);
00718         }
00719         errfunc (eip, errdata);
00720     }
00721 }
00722 
00723 
00724 /* This function reports that some sequences are inside brackets (indicating a segmented set)
00725  * and that some sequences are outside the brackets.
00726  */
00727 static void 
00728 s_ReportSegmentedAlignmentError 
00729 (TIntLinkPtr          offset_list,
00730  FReportErrorFunction errfunc,
00731  void *               errdata)
00732 {
00733     TErrorInfoPtr eip;
00734     const char * msg = "This file contains sequences in brackets (indicating "
00735         "a segmented alignment) as well as sequences not in brackets at lines "
00736         "%s.  Please either add or remove brackets to correct this problem.";
00737     int num_lines = 0;
00738     int         msg_len = 0;
00739     TIntLinkPtr t;
00740     char *      line_text_list;
00741     char *      line_text_list_offset;
00742 
00743     if (errfunc == NULL || offset_list == NULL) {
00744         return;
00745     }
00746 
00747     for (t = offset_list; t != NULL; t = t->next)
00748     {
00749         num_lines ++;
00750     }
00751     msg_len = num_lines * (kMaxPrintedIntLen + 2);
00752     if (num_lines > 1) 
00753     {
00754         msg_len += 4;
00755     }
00756     line_text_list = (char *) malloc (msg_len);
00757     if (line_text_list == NULL) return;
00758     line_text_list_offset = line_text_list;
00759     for (t = offset_list; t != NULL; t = t->next)
00760     {
00761         if (t->next == NULL)
00762         {
00763             sprintf (line_text_list_offset, "%d", t->ival);
00764         }
00765         else if (num_lines == 2) 
00766         {
00767             sprintf (line_text_list_offset, "%d and ", t->ival);
00768         }
00769         else if (t->next->next == NULL)
00770         {
00771             sprintf (line_text_list_offset, "%d, and ", t->ival);
00772         }
00773         else
00774         {
00775             sprintf (line_text_list_offset, "%d, ", t->ival);
00776         }
00777         line_text_list_offset += strlen (line_text_list_offset);
00778     }
00779 
00780     msg_len += strlen (msg) + 1;
00781 
00782     eip = ErrorInfoNew (NULL);
00783     if (eip != NULL) {
00784         eip->category = eAlnErr_BadData;
00785         eip->message = (char *) malloc (msg_len);
00786         if (eip->message != NULL) {
00787             sprintf (eip->message, msg, line_text_list);
00788         }
00789         errfunc (eip, errdata);
00790     }
00791     free (line_text_list);
00792 }
00793 
00794 
00795 /* This function reports an error if a line looks like it might contain an organism comment
00796  * but is somehow improperly formatted
00797  */
00798 static void s_ReportOrgCommentError 
00799 (char *               linestring,
00800  FReportErrorFunction errfunc,
00801  void *               errdata)
00802 {
00803     TErrorInfoPtr eip;
00804     const char * msg = "This line may contain an improperly formatted organism description.\n"
00805                        "Organism descriptions should be of the form [org=tax name] or [organism=tax name].\n";
00806     
00807     if (errfunc == NULL || linestring == NULL) {
00808         return;
00809     }
00810                        
00811     eip = ErrorInfoNew (NULL);
00812     if (eip != NULL) {
00813         eip->category = eAlnErr_BadData;
00814         eip->message = (char *) malloc (strlen (msg) + strlen (linestring) + 1);
00815         if (eip->message != NULL) {
00816             strcpy (eip->message, msg);
00817             strcat (eip->message, linestring);
00818         }
00819         errfunc (eip, errdata);
00820     }
00821 }
00822 
00823  
00824 /* This function reports that the number of segments in an alignment of
00825  * segmented sets is inconsistent.
00826  */
00827 static void s_ReportBadNumSegError 
00828 (int                  line_num,
00829  int                  num_seg,
00830  int                  num_seg_exp,
00831  FReportErrorFunction errfunc,
00832  void *               errdata)
00833 {
00834     TErrorInfoPtr eip;
00835     const char * msg = "This segmented set contains a different number of segments (%d) than expected (%d).\n";
00836     
00837     if (errfunc == NULL) {
00838         return;
00839     }
00840                        
00841     eip = ErrorInfoNew (NULL);
00842     if (eip != NULL) {
00843         eip->line_num = line_num;
00844         eip->category = eAlnErr_BadData;
00845         eip->message = (char *) malloc (strlen (msg) + 2 * kMaxPrintedIntLen + 1);
00846         if (eip->message != NULL) {
00847             sprintf (eip->message, msg, num_seg, num_seg_exp);
00848         }
00849         errfunc (eip, errdata);
00850     }
00851 }
00852 
00853  
00854 /* This function allocates memory for a SSequenceInfo structure and
00855  * initializes the member variables.  It returns a pointer to the newly
00856  * allocated memory.
00857  */
00858 extern TSequenceInfoPtr SequenceInfoNew (void)
00859 {
00860     TSequenceInfoPtr sip;
00861 
00862     sip = (TSequenceInfoPtr) malloc (sizeof (SSequenceInfo));
00863     if (sip == NULL) {
00864         return NULL;
00865     }
00866     sip->missing       = strdup ("?");
00867     sip->beginning_gap = strdup (".");
00868     sip->middle_gap    = strdup ("-");
00869     sip->end_gap       = strdup (".");
00870     sip->match         = strdup (".");
00871     sip->alphabet      = NULL;
00872     return sip;
00873 }
00874 
00875 
00876 /* This function frees memory associated with the member variables of
00877  * the SSequenceInfo structure and with the structure itself.
00878  */
00879 extern void SequenceInfoFree (TSequenceInfoPtr sip)
00880 {
00881     if (sip == NULL) {
00882         return;
00883     }
00884     free (sip->missing);
00885     free (sip->beginning_gap);
00886     free (sip->middle_gap);
00887     free (sip->end_gap);
00888     free (sip->match);
00889     sip->alphabet = NULL;
00890     free (sip);
00891 }
00892 
00893 
00894 /* This function creates and sends an error message regarding an unused line.
00895  */
00896 static void 
00897 s_ReportUnusedLine
00898 (int                  line_num_start,
00899  int                  line_num_stop,
00900  TLineInfoPtr         line_val,
00901  FReportErrorFunction errfunc,
00902  void *               errdata)
00903 {
00904     TErrorInfoPtr eip;
00905     const char * errformat1 = "Line %d could not be assigned to an interleaved block";
00906     const char * errformat2 = "Lines %d through %d could not be assigned to an interleaved block";
00907     const char * errformat3 = "Contents of unused line: %s";
00908     int skip;
00909 
00910     if (errfunc == NULL  ||  line_val == NULL) {
00911         return;
00912     }
00913 
00914     eip = ErrorInfoNew (NULL);
00915     if (eip != NULL) {
00916         eip->category = eAlnErr_BadFormat;
00917         eip->line_num = line_num_start;
00918         if (line_num_start == line_num_stop) {
00919               eip->message = (char *) malloc (strlen (errformat1)
00920                                             + kMaxPrintedIntLen + 1);
00921             if (eip->message != NULL) {
00922                 sprintf (eip->message, errformat1, line_num_start);
00923             }
00924         } else {
00925             eip->message = (char *) malloc (strlen (errformat2)
00926                                             + 2 * kMaxPrintedIntLen + 1);
00927             if (eip->message != NULL) {
00928                 sprintf (eip->message, errformat2, line_num_start,
00929                          line_num_stop);
00930             }
00931         }
00932         errfunc (eip, errdata);
00933     }
00934     /* report contents of unused lines */
00935     for (skip = line_num_start;
00936          skip < line_num_stop + 1  &&  line_val != NULL;
00937          skip++) {
00938         if (line_val->data == NULL) {
00939             continue;
00940         }
00941         eip = ErrorInfoNew (NULL);
00942         if (eip != NULL) {
00943             eip->category = eAlnErr_BadFormat;
00944             eip->line_num = skip;
00945             eip->message = (char *) malloc (strlen (errformat3)
00946                                             + strlen (line_val->data) + 1);
00947             if (eip->message != NULL) {
00948                 sprintf (eip->message, errformat3, line_val->data);
00949             }
00950             errfunc (eip, errdata);
00951         }
00952         line_val = line_val->next;
00953     }
00954 }
00955 
00956 
00957 /* The following functions are used to manage a linked list of integer
00958  * values.
00959  */
00960 
00961 /* This function creates a new SIntLink structure with a value of ival.
00962  * The new structure will be placed at the end of list if list is not NULL.
00963  * The function will return a pointer to the new structure.
00964  */
00965 static TIntLinkPtr 
00966 s_IntLinkNew 
00967 (int         ival, 
00968  TIntLinkPtr list)
00969 {
00970     TIntLinkPtr ilp, last;
00971 
00972     ilp = (TIntLinkPtr) malloc (sizeof (SIntLink));
00973     if (ilp == NULL) {
00974         return NULL;
00975     }
00976     ilp->ival = ival;
00977     ilp->next = NULL;
00978     last = list;
00979     while (last != NULL && last->next != NULL) {
00980         last = last->next;
00981     }
00982     if (last != NULL) {
00983         last->next = ilp;
00984     }
00985     return ilp;
00986 }
00987 
00988 
00989 /* This function recursively frees memory associated with a linked list
00990  * of SIntLink structures.
00991  */
00992 static void s_IntLinkFree (TIntLinkPtr ilp)
00993 {
00994     if (ilp == NULL) {
00995         return;
00996     }
00997     s_IntLinkFree (ilp->next);
00998     free (ilp);
00999 }
01000 
01001 
01002 /* These functions are used to accumulate and retrieve information on 
01003  * how often a size of data (number of lines or number of characters) occurs.
01004  */
01005 
01006 /* This function allocates space for a new SSizeInfo structure and 
01007  * initializes its member variables.  If list is not NULL, the new structure
01008  * is added to the end of the list.
01009  * The function returns a pointer to the newly allocated structure.
01010  */
01011 static TSizeInfoPtr s_SizeInfoNew (TSizeInfoPtr list)
01012 {
01013     TSizeInfoPtr sip, last;
01014 
01015     sip = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
01016     if (sip == NULL) {
01017         return NULL;
01018     }
01019 
01020     sip->size_value      = 0;
01021     sip->num_appearances = 0;
01022     sip->next            = NULL;
01023     last = list;
01024     while (last != NULL && last->next != NULL) {
01025         last = last->next;
01026     }
01027     if (last != NULL) {
01028         last->next = sip;
01029     }
01030     return sip;
01031 }
01032 
01033 
01034 /* This function recursively frees the memory associated with a linked list
01035  * of SSizeInfo structures.
01036  */
01037 static void s_SizeInfoFree (TSizeInfoPtr list)
01038 {
01039     if (list == NULL) {
01040         return;
01041     }
01042     s_SizeInfoFree (list->next);
01043     list->next = NULL;
01044     free (list);
01045 }
01046 
01047 
01048 /* This function returns eTrue if the two SSizeInfo structures have
01049  * the same size_value and number of appearances, eFalse otherwise.
01050  */
01051 static EBool 
01052 s_SizeInfoIsEqual 
01053 (TSizeInfoPtr s1,
01054  TSizeInfoPtr s2)
01055 {
01056     if (s1 == NULL
01057       ||  s2 == NULL
01058       ||  s1->size_value != s2->size_value
01059       ||  s1->num_appearances != s2->num_appearances) {
01060         return eFalse;
01061     }
01062     return eTrue;
01063 }
01064 
01065 
01066 /* This function searches list for a SSizeInfo structure with the
01067  * same size_value as size_value.  If it finds such a structure, it
01068  * adds the value of num_appearances to the num_appearances for that
01069  * structure, otherwise the function creates a new structure at the end
01070  * of the list with the specified values of size_value and num_appearances.
01071  * The function returns a pointer to the list of SSizeInfo structures.
01072  */
01073 static TSizeInfoPtr s_AddSizeInfoAppearances 
01074 (TSizeInfoPtr list,
01075  int  size_value,
01076  int  num_appearances)
01077 {
01078     TSizeInfoPtr p, last;
01079 
01080     last = NULL;
01081     for (p = list;  p != NULL  &&  p->size_value != size_value;  p = p->next) {
01082         last = p;
01083     }
01084     if (p == NULL) {
01085         p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
01086         if (p == NULL) {
01087             return NULL;
01088         }
01089         p->size_value = size_value;
01090         p->num_appearances = num_appearances;
01091         p->next = 0;
01092         if (last == NULL) {
01093             list = p;
01094         } else {
01095             last->next = p;
01096         }
01097     } else {
01098         p->num_appearances += num_appearances;
01099     }
01100     return list;
01101 }
01102 
01103 
01104 /* This function searches list for a SSizeInfo structure with the
01105  * same size_value as size_value.  If it finds such a structure, it
01106  * adds one to the num_appearances for that structure, otherwise the 
01107  * function creates a new structure at the end of the list with the 
01108  * specified values of size_value and num_appearances.
01109  * The function returns a pointer to the list of SSizeInfo structures.
01110  */
01111 static TSizeInfoPtr 
01112 s_AddSizeInfo
01113 (TSizeInfoPtr list,
01114  int  size_value)
01115 {
01116     return s_AddSizeInfoAppearances (list, size_value, 1);
01117 }
01118 
01119 
01120 /* This function searches list for the SSizeInfo structure with the
01121  * highest value for num_appearances.  If more than one structure exists
01122  * with the highest value for num_appearances, the function chooses the
01123  * value with the highest value for size_value.  The function returns a
01124  * pointer to the structure selected based on the above criteria.
01125  */
01126 static TSizeInfoPtr s_GetMostPopularSizeInfo (TSizeInfoPtr list)
01127 {
01128     TSizeInfoPtr p, best;
01129 
01130     if (list == NULL) {
01131         return NULL;
01132     }
01133 
01134     best = list;
01135     for (p = list->next;  p != NULL;  p = p->next) {
01136       if (p->num_appearances > best->num_appearances
01137           ||  (p->num_appearances == best->num_appearances
01138             &&  p->size_value > best->size_value)) {
01139           best = p;
01140       }
01141     }
01142     return best;
01143 }
01144 
01145 
01146 /* This function uses s_GetMostPopularSizeInfo function to find the structure
01147  * in list that has the highest value for num_appearances and size_value.
01148  * If such a structure is found and has a num_appearances value greater than
01149  * one, the size_value for that structure will be returned, otherwise the
01150  * function returns 0.
01151  */
01152 static int  s_GetMostPopularSize (TSizeInfoPtr list)
01153 {
01154     TSizeInfoPtr best;
01155 
01156     best = s_GetMostPopularSizeInfo (list);
01157     if (best == NULL) {
01158         return 0;
01159     }
01160     if (best->num_appearances > 1) {
01161         return best->size_value; 
01162     } else {
01163         return 0;
01164     }
01165 }
01166 
01167 
01168 /* The following functions are used to keep track of patterns of line or
01169  * token lengths, which will be used to identify errors in formatting.
01170  */
01171 static SLengthListPtr s_LengthListNew (SLengthListPtr list)
01172 {
01173     SLengthListPtr llp, last;
01174 
01175     llp = (SLengthListPtr) malloc (sizeof (SLengthListData));
01176     if (llp == NULL) {
01177         return NULL;
01178     }
01179 
01180     llp->lengthrepeats   = NULL;
01181     llp->num_appearances = 0;
01182     llp->next            = NULL;
01183 
01184     last = list;
01185     while (last != NULL && last->next != NULL) {
01186         last = last->next;
01187     }
01188     if (last != NULL) {
01189         last->next = llp;
01190     }
01191     return llp;
01192 }
01193 
01194 
01195 /* This function recursively frees memory for a list of SLengthListData
01196  * structures and its member variables.
01197  */
01198 static void s_LengthListFree (SLengthListPtr llp)
01199 {
01200     if (llp == NULL) {
01201         return;
01202     }
01203     s_LengthListFree (llp->next);
01204     s_SizeInfoFree (llp->lengthrepeats);
01205     free (llp);
01206 }
01207 
01208 
01209 /* This function examines the last SSizeInfo structure in the 
01210  * lengthrepeats member variable of llp.  If the last structure 
01211  * in the list has the same size_value value as the function argument 
01212  * size_value, the value of num_appearances for that SizeInforData structure
01213  * will be incremented.  Otherwise a new SSizeInfo structure will be
01214  * appended to the end of the lengthrepeats list with the specified
01215  * size_value and a num_appearances value of 1.
01216  */
01217 static void 
01218 s_AddLengthRepeat
01219 (SLengthListPtr llp,
01220  int  size_value)
01221 {
01222     TSizeInfoPtr p, last;
01223 
01224     if (llp == NULL) {
01225         return;
01226     }
01227 
01228     last = NULL;
01229     for (p = llp->lengthrepeats;  p != NULL;  p = p->next) {
01230         last = p;
01231     }
01232     if (last == NULL  ||  last->size_value != size_value) {
01233         p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
01234         if (p == NULL) {
01235             return;
01236         }
01237         p->size_value = size_value;
01238         p->num_appearances = 1;
01239         p->next = 0;
01240         if (last == NULL) {
01241             llp->lengthrepeats = p;
01242         } else {
01243             last->next = p;
01244         }
01245     } else {
01246         last->num_appearances ++;
01247     }
01248 }
01249 
01250 
01251 /* This function examines whether two SLengthListData structures "match" - 
01252  * the structures match if each SSizeInfo structure in llp1->lengthrepeats
01253  * has the same size_value and num_appearances values as the SSizeInfo
01254  * structure in the corresponding list position in llp2->lenghrepeats.
01255  * If the two structures match, the function returns eTrue, otherwise the
01256  * function returns eFalse.
01257  */
01258 static EBool 
01259 s_DoLengthPatternsMatch 
01260 (SLengthListPtr llp1,
01261  SLengthListPtr llp2)
01262 {
01263     TSizeInfoPtr sip1, sip2;
01264 
01265     if (llp1 == NULL  ||  llp2 == NULL
01266         ||  llp1->lengthrepeats == NULL
01267         ||  llp2->lengthrepeats == NULL) {
01268         return eFalse;
01269     }
01270     for (sip1 = llp1->lengthrepeats, sip2 = llp2->lengthrepeats;
01271          sip1 != NULL  &&  sip2 != NULL;
01272          sip1 = sip1->next, sip2 = sip2->next) {
01273         if ( ! s_SizeInfoIsEqual (sip1, sip2)
01274           ||  (sip1->next == NULL  &&  sip2->next != NULL)
01275           ||  (sip1->next != NULL  &&  sip2->next == NULL)) {
01276             return eFalse;
01277         }
01278     }
01279     return eTrue;
01280 }
01281 
01282 
01283 /* This function examines a list of SLengthListData structures to see if
01284  * one of them matches llp.  If so, the value of num_appearances in that
01285  * list is incremented by one and llp is freed, otherwise llp is added
01286  * to the end of the list.
01287  * The function returns a pointer to the list of LenghtListData structures.
01288  */
01289 static SLengthListPtr
01290 s_AddLengthList
01291 (SLengthListPtr list,
01292  SLengthListPtr llp)
01293 {
01294     SLengthListPtr prev_llp;
01295 
01296     if (list == NULL) {
01297         list = llp;
01298     } else {
01299         prev_llp = list;
01300         while ( prev_llp->next  &&  ! s_DoLengthPatternsMatch (prev_llp, llp)) {
01301             prev_llp = prev_llp->next;
01302         }
01303         if (s_DoLengthPatternsMatch (prev_llp, llp)) {
01304             prev_llp->num_appearances ++;
01305             s_LengthListFree (llp);
01306         } else {
01307             prev_llp->next = llp;
01308         }
01309     }
01310     return list;
01311 }
01312 
01313 
01314 /* This set of functions is used for storing and analyzing individual lines
01315  * or tokens from an alignment file.
01316  */
01317 
01318 /* This function allocates memory for a new SLineInfo structure and
01319  * initializes the structure with a saved copy of string and the specified
01320  * values of line_num and line_offset.
01321  * The function returns a pointer to the new SLineInfo structure.
01322  */
01323 static TLineInfoPtr
01324 s_LineInfoNew
01325 (char * string,
01326  int    line_num,
01327  int    line_offset)
01328 {
01329     TLineInfoPtr lip;
01330 
01331     lip = (TLineInfoPtr) malloc (sizeof (SLineInfo));
01332     if (lip == NULL) {
01333         return NULL;
01334     }
01335     lip->data = strdup (string);
01336     lip->line_num = line_num;
01337     lip->line_offset = line_offset;
01338     lip->delete_me = eFalse;
01339     lip->next = NULL;
01340     return lip;
01341 }
01342 
01343 
01344 /* This function recursively frees the memory associated with the structures
01345  * and members of the structures in a linked list of SLineInfo structures.
01346  */
01347 static void s_LineInfoFree (TLineInfoPtr lip)
01348 {
01349     TLineInfoPtr next_lip;
01350     if (lip == NULL) {
01351         return;
01352     }
01353     while (lip != NULL) {
01354         next_lip = lip->next;
01355         lip->next = NULL;
01356         free (lip->data);
01357         free (lip);
01358         lip = next_lip; 
01359     }
01360 }
01361 
01362 
01363 /* This function deletes from a linked list of SLineInfo structures
01364  * those structures for which the delete_me flag has been set.  The function
01365  * returns a pointer to the beginning of the new list.
01366  */
01367 static TLineInfoPtr s_DeleteLineInfos (TLineInfoPtr list)
01368 {
01369     TLineInfoPtr prev = NULL;
01370     TLineInfoPtr lip, nextlip;
01371 
01372     lip = list;
01373     while (lip != NULL) {
01374         nextlip = lip->next;
01375         if (lip->delete_me) {
01376             if (prev != NULL) {
01377                 prev->next = lip->next;
01378             } else {
01379                 list = lip->next;
01380             }
01381             lip->next = NULL;
01382             s_LineInfoFree (lip);
01383         } else {
01384             prev = lip;
01385         }
01386         lip = nextlip;
01387     }
01388     return list;
01389 }
01390      
01391  
01392 /* This function creates a new SLineInfo structure, populates it with
01393  * a copy of string and the specified line_num and line_offset values,
01394  * and appends it to the end of "list" if list is not NULL.
01395  * The function will return a pointer to the newly created structure
01396  * if list is NULL, otherwise the function will return list.
01397  */
01398 static TLineInfoPtr 
01399 s_AddLineInfo
01400 (TLineInfoPtr list,
01401  char * string,
01402  int    line_num,
01403  int    line_offset)
01404 {
01405     TLineInfoPtr lip, p;
01406 
01407     if (string == NULL) {
01408         return list;
01409     }
01410     lip = s_LineInfoNew (string, line_num, line_offset);
01411     if (lip == NULL) {
01412         return NULL;
01413     }
01414     if (list == NULL) {
01415         list = lip;
01416     } else {
01417         p = list;
01418         while (p != NULL  &&  p->next != NULL) {
01419             p = p->next;
01420         }
01421         p->next = lip;
01422     }
01423     return list;
01424 }
01425 
01426 /* This function creates a new bracketed comment */
01427 static TBracketedCommentListPtr s_BracketedCommentListNew 
01428 (TBracketedCommentListPtr list,
01429  char * string,
01430  int    line_num,
01431  int    line_offset)
01432 {
01433     TBracketedCommentListPtr comment;
01434         
01435     comment = (TBracketedCommentListPtr) malloc (sizeof (SBracketedCommentList));
01436     if (comment == NULL) {
01437         return NULL;
01438     }
01439     comment->comment_lines = s_LineInfoNew (string, line_num, line_offset);
01440     comment->next = NULL;
01441     
01442     if (list != NULL) {
01443         while (list->next != NULL) {
01444             list = list->next;
01445         }
01446         list->next = comment;
01447     }
01448     
01449     return comment;
01450 }
01451 
01452 /* This function frees a bracketed comment list. */
01453 static void s_BracketedCommentListFree (TBracketedCommentListPtr list)
01454 {
01455     if (list == NULL) {
01456         return;
01457     }
01458     s_BracketedCommentListFree (list->next);
01459     list->next = NULL;
01460     s_LineInfoFree (list->comment_lines);
01461 }
01462 
01463 /* This function adds a line to a bracketed comment. */
01464 static void s_BracketedCommentListAddLine 
01465 (TBracketedCommentListPtr comment,
01466  char                   * string,
01467  int                      line_num,
01468  int                      line_offset)
01469 {
01470     if (comment == NULL) {
01471         return;
01472     }
01473 
01474     comment->comment_lines = s_AddLineInfo (comment->comment_lines, string, line_num, line_offset);
01475 }
01476 
01477 /* This function counts the sequences found in a bracketed comment. */
01478 static int s_CountSequencesInBracketedComment (TBracketedCommentListPtr comment)
01479 {
01480     TLineInfoPtr lip;
01481     int          num_segments = 0;
01482     EBool        skipped_line_since_last_defline = eTrue;
01483     
01484     if (comment == NULL || comment->comment_lines == NULL) {
01485         return 0;
01486     }
01487     
01488     lip = comment->comment_lines;
01489     /* First line must be left bracket on a line by itself */
01490     if (lip->data[0] != '[' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
01491     {
01492         return 0;
01493     }
01494     lip = lip->next;
01495     while (lip != NULL && lip->next != NULL)
01496     {
01497         if (lip->data[0] == '>')
01498         {
01499             if (!skipped_line_since_last_defline) 
01500             {
01501                 return 0;
01502             }
01503             else
01504             {
01505                 num_segments ++;
01506                 skipped_line_since_last_defline = eFalse;
01507             }
01508         }
01509         else 
01510         {
01511             skipped_line_since_last_defline = eTrue;
01512         }
01513         lip = lip->next;
01514     }
01515     /* Last line must be right bracket on a line by itself */
01516     /* First line must be left bracket on a line by itself */
01517     if (lip->data[0] != ']' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
01518     {
01519         return 0;
01520     }
01521     
01522     return num_segments;
01523 }
01524 
01525 /* This function counts the number of sequences that appear in
01526  * bracketed comments.  If the number of sequences is inconsistent,
01527  * the function will issue error messages and return a 1, otherwise
01528  * the function will return the number of sequences that appear in
01529  * each bracketed comment.
01530  */
01531 static int s_GetNumSegmentsInAlignment 
01532 (TBracketedCommentListPtr comment_list,
01533  FReportErrorFunction     errfunc,
01534  void *                   errdata)
01535 {
01536     TBracketedCommentListPtr comment;
01537     TSizeInfoPtr             segcount_list = NULL;
01538     int                      num_segments = 1;
01539     int                      num_segments_this_bracket;
01540     int                      num_segments_expected;
01541     TSizeInfoPtr             best;
01542     
01543     if (comment_list == NULL)
01544     {
01545         return num_segments;
01546     }
01547     
01548     for (comment = comment_list; comment != NULL; comment = comment->next)
01549     {
01550         num_segments_this_bracket = s_CountSequencesInBracketedComment (comment);
01551         segcount_list = s_AddSizeInfoAppearances (segcount_list,
01552                                                   num_segments_this_bracket,
01553                                                   1);
01554         if (comment != comment_list && segcount_list->next != NULL)
01555         {
01556             best = s_GetMostPopularSizeInfo (segcount_list);
01557             num_segments_expected = best->size_value;
01558 
01559             if (num_segments_expected != num_segments_this_bracket)
01560             {
01561                 s_ReportBadNumSegError (comment->comment_lines->line_num,
01562                                         num_segments_this_bracket, num_segments_expected,
01563                                         errfunc, errdata);
01564             }
01565         }
01566     }
01567     if (segcount_list != NULL && segcount_list->next == NULL && segcount_list->size_value > 0)
01568     {
01569         num_segments = segcount_list->size_value;
01570     }
01571     s_SizeInfoFree (segcount_list);
01572     return num_segments;
01573 }
01574 
01575 /* This function gets a list of the offsets of the 
01576  * sequences in bracketed comments.
01577  */
01578 static TIntLinkPtr GetSegmentOffsetList (TBracketedCommentListPtr comment_list)
01579 {
01580     TIntLinkPtr              new_offset, offset_list = NULL;
01581     TBracketedCommentListPtr comment;
01582     TLineInfoPtr             lip;
01583 
01584     if (comment_list == NULL) 
01585     {
01586         return NULL;
01587     }
01588     
01589     for (comment = comment_list; comment != NULL; comment = comment->next)
01590     {
01591         if (s_CountSequencesInBracketedComment (comment) == 0) 
01592         {
01593             continue;
01594         }
01595         for (lip = comment->comment_lines; lip != NULL; lip = lip->next)
01596         {
01597             if (lip->data != NULL && lip->data[0] == '>') 
01598             {
01599                 new_offset = s_IntLinkNew (lip->line_num + 1, offset_list);
01600                 if (offset_list == NULL) offset_list = new_offset;
01601             }
01602         }
01603     }
01604     return offset_list;
01605 }
01606 
01607 static char * s_TokenizeString (char * str, char *delimiter, char **last)
01608 {
01609     int skip;
01610     int length;
01611 
01612     if (str == NULL) {
01613         str = *last;
01614     }
01615     if (delimiter == NULL) {
01616         *last = NULL;
01617         return NULL;
01618     }
01619 
01620     if (str == NULL || *str == 0) {
01621         return NULL;
01622     }
01623     skip = strspn (str, delimiter);
01624     str += skip;
01625     length = strcspn (str, delimiter);
01626     *last = str + length;
01627     if (**last != 0) {
01628         **last = 0;
01629         (*last) ++;
01630     }
01631     return str;
01632 }
01633   
01634 
01635 /* This function creates a new list of SLineInfo structures by tokenizing
01636  * each data element from line_list into multiple tokens at whitespace.
01637  * The function returns a pointer to the new list.  The original list is
01638  * unchanged.
01639  */
01640 static TLineInfoPtr s_BuildTokenList (TLineInfoPtr line_list)
01641 {
01642     TLineInfoPtr first_token, lip;
01643     char *       tmp;
01644     char *       piece;
01645     char *       last;
01646     int          line_pos;
01647 
01648     first_token = NULL;
01649 
01650     for (lip = line_list;  lip != NULL;  lip = lip->next) {
01651         if (lip->data != NULL  &&  (tmp = strdup (lip->data)) != NULL) {
01652             piece = s_TokenizeString (tmp, " \t\r", &last);
01653             while (piece != NULL) {
01654                 line_pos = piece - tmp;
01655                 line_pos += lip->line_offset;
01656                 first_token = s_AddLineInfo (first_token, piece, 
01657                                              lip->line_num,
01658                                              line_pos);
01659                 piece = s_TokenizeString (NULL, " \t\r", &last);
01660             }
01661             free (tmp);
01662         }
01663     }
01664     return first_token;
01665 }
01666 
01667 
01668 /* This function takes a list of SLineInfo structures, allocates memory
01669  * to hold their contents contiguously, and stores their contents, minus
01670  * the whitespace, in the newly allocated memory.
01671  * The function returns a pointer to this newly allocated memory.
01672  */
01673 static char * s_LineInfoMergeAndStripSpaces (TLineInfoPtr list)
01674 {
01675     TLineInfoPtr lip;
01676     int          len;
01677     char *       result;
01678     char *       cp_to;
01679     char *       cp_from;
01680 
01681     if (list == NULL) {
01682         return NULL;
01683     }
01684     len = 0;
01685     for (lip = list;  lip != NULL;  lip = lip->next) {
01686         if (lip->data != NULL) {
01687             len += strlen (lip->data);
01688         }
01689     }
01690     result = (char *) malloc (len + 1);
01691     if (result == NULL) {
01692         return result;
01693     }
01694     cp_to = result;
01695     for (lip = list;  lip != NULL;  lip = lip->next) {
01696         if (lip->data != NULL) {
01697             cp_from = lip->data;
01698             while (*cp_from != 0) {
01699                 if (! isspace ((unsigned char)*cp_from)) {
01700                     *cp_to = *cp_from;
01701                     cp_to ++;
01702                 }
01703                 cp_from ++;
01704             }
01705         }
01706     }
01707     *cp_to = 0;
01708     return result;
01709 }
01710 
01711 
01712 /* The following functions are used to manage the SLineInfoReader
01713  * structure.  The intention is to allow the user to access the data
01714  * from a linked list of SLineInfo structures using a given position
01715  * in the data based on the number of sequence data characters rather than 
01716  * any particular line number or position in the line.  This is useful
01717  * for matching up a data position in a record with a match character with
01718  * the same data position in the first or master record.  This is also useful
01719  * for determining how to interpret special characters that may have
01720  * context-sensitive meanings.  For example, a ? could indicate a missing 
01721  * character if it is inside a sequence but indicate a gap if it is outside
01722  * a sequence.
01723  */
01724 
01725 /* This function is used to advance the current data position pointer
01726  * for a SLineInfoReader structure past white space and blank lines
01727  * in sequence data.
01728  */
01729 static void s_LineInfoReaderAdvancePastSpace (TLineInfoReaderPtr lirp)
01730 {
01731     if (lirp->curr_line_pos == NULL) {
01732         return;
01733     }
01734     while ( isspace ((unsigned char) *lirp->curr_line_pos)
01735            ||  *lirp->curr_line_pos == 0) {
01736         while ( isspace ((unsigned char)*lirp->curr_line_pos)) {
01737             lirp->curr_line_pos ++;
01738         }
01739         if (*lirp->curr_line_pos == 0) {
01740             lirp->curr_line = lirp->curr_line->next;
01741             while (lirp->curr_line != NULL
01742                    &&  lirp->curr_line->data == NULL) {
01743                 lirp->curr_line = lirp->curr_line->next;
01744             }
01745             if (lirp->curr_line == NULL) {
01746                 lirp->curr_line_pos = NULL;
01747                 return;
01748             } else {
01749                 lirp->curr_line_pos = lirp->curr_line->data;
01750             }
01751         }
01752     }
01753 }
01754 
01755 
01756 /* This function sets the current data position pointer to the first
01757  * non-whitespace character in the sequence data.
01758  */
01759 static void s_LineInfoReaderReset (TLineInfoReaderPtr lirp)
01760 {
01761     if (lirp == NULL) {
01762         return;
01763     }
01764     lirp->curr_line = lirp->first_line;
01765 
01766     while (lirp->curr_line != NULL  &&  lirp->curr_line->data == NULL) {
01767         lirp->curr_line = lirp->curr_line->next;
01768     }
01769     if (lirp->curr_line == NULL) {
01770         lirp->curr_line_pos = NULL;
01771         lirp->data_pos = -1;
01772     } else {
01773         lirp->curr_line_pos = lirp->curr_line->data;
01774         s_LineInfoReaderAdvancePastSpace (lirp);
01775         if (lirp->curr_line_pos == NULL) {
01776             lirp->data_pos = -1;
01777         } else {
01778             lirp->data_pos = 0;
01779         }
01780     }
01781 }
01782 
01783  
01784 /* This function creates a new SLineInfoReader structure and initializes
01785  * its member variables.  The current data position pointer is set to the
01786  * first non-whitespace character in the sequence data, and the data position
01787  * counter is set to zero.  The function returns a pointer to the new
01788  * LineInfoReader data structure.
01789  */
01790 static TLineInfoReaderPtr s_LineInfoReaderNew (TLineInfoPtr line_list)
01791 {
01792     TLineInfoReaderPtr lirp;
01793 
01794     if (line_list == NULL) {
01795         return NULL;
01796     }
01797     lirp = (TLineInfoReaderPtr) malloc (sizeof (SLineInfoReader));
01798     if (lirp == NULL) {
01799         return NULL;
01800     }
01801 
01802     lirp->first_line = line_list;
01803     s_LineInfoReaderReset (lirp);
01804     return lirp;
01805 }
01806 
01807 
01808 /* This function safely interprets the current line number of the
01809  * SLineInfoReader structure.  If the structure is NULL or the
01810  * current line is NULL (usually because the data position has been
01811  * advanced to the end of the available sequence data), the function
01812  * returns -1, since the current data position does not actually exist.
01813  * Otherwise, the line number of the character at the current data position
01814  * is returned.
01815  */
01816 static int  s_LineInfoReaderGetCurrentLineNumber (TLineInfoReaderPtr lirp)
01817 {
01818     if (lirp == NULL  ||  lirp->curr_line == NULL) {
01819         return -1;
01820     } else {
01821         return lirp->curr_line->line_num;
01822     }
01823 }
01824 
01825 
01826 /* This function safely interprets the position of the current data position
01827  * of the SLineInfoReader structure.  If the structure is NULL or the
01828  * current line is NULL or the current line position is NULL (usually because
01829  * the data position has been advanced to the end of the available sequence
01830  * data), the function returns -1, since the current data position does not 
01831  * actually exist.
01832  * Otherwise, the position within the line of the character at the current 
01833  * data position is returned.
01834  */
01835 static int  s_LineInfoReaderGetCurrentLineOffset (TLineInfoReaderPtr lirp)
01836 {
01837     if (lirp == NULL  ||  lirp->curr_line == NULL 
01838         ||  lirp->curr_line_pos == NULL) {
01839         return -1;
01840     } else {
01841         return lirp->curr_line->line_offset + lirp->curr_line_pos 
01842                      - lirp->curr_line->data;
01843     }
01844 }
01845 
01846 
01847 /* This function frees the memory associated with the SLineInfoReader
01848  * structure.  Notice that this function does NOT free the SLineInfo list.
01849  * This is by design.
01850  */
01851 static void s_LineInfoReaderFree (TLineInfoReaderPtr lirp)
01852 {
01853     if (lirp == NULL) {
01854         return;
01855     }
01856     free (lirp);
01857     lirp = NULL;
01858 }
01859 
01860 
01861 /* This function retrieves the "pos"th sequence data character from the lines
01862  * of sequence data.  If the data position requested is greater than the
01863  * current position, the current data pointer will be advanced until the
01864  * current position is the requested position or there is no more data.  If
01865  * there is no more data, the function returns a 0.  If the data position
01866  * requested is lower than the current position, the current position is reset
01867  * to the beginning of the sequence and advanced from there.
01868  * As a result, it is clearly more efficient to read the data in the forward
01869  * direction, but it is still possible to access the data randomly.
01870  */
01871 static char 
01872 s_FindNthDataChar
01873 (TLineInfoReaderPtr lirp,
01874  int  pos)
01875 {
01876     if (lirp == NULL  ||  lirp->first_line == NULL  ||  pos < 0
01877         ||  lirp->data_pos == -1) {
01878         return 0;
01879     }
01880 
01881     if (lirp->data_pos == pos) {
01882         if (lirp->curr_line_pos == NULL) {
01883             return 0;
01884         } else {
01885             return *lirp->curr_line_pos;
01886         }
01887     }
01888     if (lirp->data_pos > pos) {
01889         s_LineInfoReaderReset (lirp);
01890     }
01891      
01892     while (lirp->data_pos < pos  &&  lirp->curr_line != NULL) {
01893         lirp->curr_line_pos ++;
01894         /* skip over spaces, progress to next line if necessary */
01895         s_LineInfoReaderAdvancePastSpace (lirp);
01896         lirp->data_pos ++;
01897     }
01898     if (lirp->curr_line_pos != NULL) {
01899         return *lirp->curr_line_pos;
01900     } else {
01901         return 0;
01902     }
01903 }
01904 
01905 
01906 /* The following functions are used to manage the SStringCount structure.
01907  * These functions are useful for determining whether a string is unique
01908  * or whether only one string is used for a particular purpose.
01909  * The structure also tracks the line numbers on which a particular string
01910  * appeared.
01911  */
01912 
01913 /* This function allocates memory for a new SStringCount structure,
01914  * initializes its member variables.  The function also places the 
01915  * structure at the end of list if list is not NULL.
01916  * The function returns a pointer to the newly allocated SStringCount
01917  * structure.
01918  */
01919 static TStringCountPtr s_StringCountNew (TStringCountPtr list)
01920 {
01921     TStringCountPtr new_item, last;
01922 
01923     new_item = (TStringCountPtr) malloc (sizeof (SStringCount));
01924     if (new_item == NULL) {
01925         return NULL;
01926     }
01927     new_item->string          = NULL;
01928     new_item->num_appearances = 0;
01929     new_item->line_numbers    = NULL;
01930     new_item->next            = NULL;
01931 
01932     last = list;
01933     while (last != NULL && last->next != NULL) {
01934         last = last->next;
01935     }
01936     if (last != NULL) {
01937         last->next = new_item;
01938     }
01939     return new_item;
01940 }
01941 
01942 
01943 /* This function recursively frees data associated with the structures
01944  * and structure member variables in a linked list of SStringCount
01945  * structures.
01946  */
01947 static void s_StringCountFree (TStringCountPtr list)
01948 {
01949     if (list == NULL) {
01950         return;
01951     }
01952     s_StringCountFree (list->next);
01953     s_IntLinkFree (list->line_numbers);
01954     free (list);
01955 }
01956 
01957 
01958 /* This function searches list to see if the string matches any of the
01959  * existing entries.  If so, the num_appearances value for that entry is
01960  * increased and the line_num is added to that entry's list of line numbers.
01961  * Otherwise a new entry is created at the end of the list.
01962  * The function returns list if list was not NULL, or a pointer to the
01963  * newly created SStringCount structure otherwise.
01964  */
01965 static TStringCountPtr s_AddStringCount (
01966   char *          string,
01967   int             line_num,
01968   TStringCountPtr list
01969 )
01970 {
01971     TStringCountPtr  add_to, last;
01972     TIntLinkPtr      new_offset;
01973 
01974     if (string == NULL) {
01975         for (add_to = list;
01976              add_to != NULL  &&  add_to->string != NULL;
01977              add_to = add_to->next) {
01978             last = add_to;
01979         }
01980     } else {
01981         for (add_to = list;
01982              add_to != NULL
01983                &&  (add_to->string == NULL
01984                  ||  strcmp (string, add_to->string) != 0);
01985              add_to = add_to->next) {
01986             last = add_to;
01987         }
01988     }
01989     
01990     if (add_to == NULL) {
01991         add_to = s_StringCountNew (list);
01992         if (list == NULL) list = add_to;
01993         if (add_to != NULL) {
01994             add_to->string = string;
01995         }
01996     }
01997     if (add_to != NULL) {
01998         add_to->num_appearances ++;
01999         new_offset = s_IntLinkNew (line_num, add_to->line_numbers);
02000         if (add_to->line_numbers == NULL) {
02001             add_to->line_numbers = new_offset;
02002         }
02003     }
02004     return list;   
02005 }
02006 
02007 /* The following functions are replacements for strncasecmp and strcasecmp */
02008 
02009 /* This function returns -1 if str1 is less than str2 in the first cmp_count
02010  * characters (using case-insensitive comparisons), 0 if they are equal,
02011  * and 1 if str1 is greater than str2.
02012  */
02013 static int s_StringNICmp (char * str1, char *str2, int cmp_count)
02014 {
02015     char * cp1;
02016     char * cp2;
02017     int    char_count, diff;
02018 
02019     if (str1 == NULL && str2 == NULL) {
02020         return 0;
02021     }
02022     if (str1 == NULL) {
02023         return -1;
02024     }
02025     if (str2 == NULL) {
02026         return 1;
02027     }
02028     cp1 = str1;
02029     cp2 = str2;
02030     char_count = 0;
02031     while (*cp1 != 0  &&  *cp2 != 0  &&  char_count < cmp_count) {
02032         diff = toupper ((unsigned char)(*cp1)) - toupper ((unsigned char)(*cp2));
02033         if (diff != 0) {
02034             return diff;
02035         }
02036         char_count ++;
02037         cp1++;
02038         cp2++;
02039     }
02040     if (char_count == cmp_count) {
02041         return 0;
02042     } else if (*cp1 == 0  &&  *cp2 != 0) {
02043         return -1;
02044     } else if (*cp1 != 0  && *cp2 == 0) {
02045         return 1;
02046     } else {
02047         return 0;
02048     }
02049 }
02050 
02051 
02052 /* This function returns -1 if str1 is less than str2 using case-insensitive
02053  * comparisons), 0 if they are equal, and 1 if str1 is greater than str2.
02054  */
02055 static int s_StringICmp (char * str1, char *str2)
02056 {
02057     char * cp1;
02058     char * cp2;
02059     int    diff;
02060 
02061     if (str1 == NULL && str2 == NULL) {
02062         return 0;
02063     }
02064     if (str1 == NULL) {
02065         return -1;
02066     }
02067     if (str2 == NULL) {
02068         return 1;
02069     }
02070     cp1 = str1;
02071     cp2 = str2;
02072     while (*cp1 != 0  &&  *cp2 != 0) {
02073         diff = toupper ((unsigned char) *cp1) - toupper ((unsigned char) *cp2);
02074         if (diff != 0) {
02075             return diff;
02076         }
02077         cp1++;
02078         cp2++;
02079     }
02080     if (*cp1 == 0  &&  *cp2 != 0) {
02081         return -1;
02082     } else if (*cp1 != 0  && *cp2 == 0) {
02083         return 1;
02084     } else {
02085         return 0;
02086     }
02087 }
02088 
02089 
02090 /* The following functions are used to analyze specific kinds of lines
02091  * found in alignment files for information regarding the number of
02092  * expected sequences, the expected length of those sequences, and the
02093  * characters used to indicate missing, gap, and match characters.
02094  */
02095 
02096 /* This function reads two numbers separated by whitespace from the
02097  * beginning of the string and uses them to set the expected number of
02098  * sequences and the expected number of characters per sequence.
02099  */
02100 static void
02101 s_GetFASTAExpectedNumbers
02102 (char *           str,
02103  SAlignRawFilePtr afrp)
02104 {
02105     char *  cp;
02106     char *  cpend;
02107     char    replace;
02108     int     first, second;
02109 
02110     if (str == NULL  ||  afrp == NULL) {
02111         return;
02112     }
02113     cp = str;
02114     while (! isdigit ((unsigned char)*cp)  &&  *cp != 0) {
02115         cp++;
02116     }
02117 
02118     cpend = cp;
02119     while (isdigit ((unsigned char)*cpend)  &&  *cpend != 0) {
02120         cpend++;
02121     }
02122     if (cp == cpend) {
02123         return;
02124     }
02125     replace = *cpend;
02126     *cpend = 0;
02127     first = atol (cp);
02128     *cpend = replace;
02129 
02130     cp = cpend;
02131     while (! isdigit ((unsigned char)*cp)  &&  *cp != 0) {
02132         cp++;
02133     }
02134 
02135     cpend = cp;
02136     while (isdigit ((unsigned char)*cpend)  &&  *cpend != 0) {
02137         cpend++;
02138     }
02139     if (cp == cpend) {
02140         return;
02141     }
02142     replace = *cpend;
02143     *cpend = 0;
02144     second = atol (cp);
02145     *cpend = replace;
02146 
02147     if (first > 0  &&  second > 0) {
02148         afrp->expected_num_sequence = first;
02149         afrp->expected_sequence_len = second;
02150     }
02151   
02152 }
02153 
02154 
02155 /* This function examines the string str to see if it begins with two
02156  * numbers separated by whitespace.  The function returns eTrue if so,
02157  * otherwise it returns eFalse.
02158  */
02159 static EBool s_IsTwoNumbersSeparatedBySpace (char * str)
02160 {
02161     char * cp;
02162     EBool  found_first_number = eFalse;
02163     EBool  found_dividing_space = eFalse;
02164     EBool  found_second_number = eFalse;
02165     EBool  found_second_number_end = eFalse;
02166 
02167     if (str == NULL) {
02168         return eFalse;
02169     }
02170     cp = str;
02171     while (*cp != 0) {
02172         if (! isdigit ((unsigned char)*cp)  &&  ! isspace ((unsigned char)*cp)) {
02173             return eFalse;
02174         }
02175         if (! found_first_number) {
02176             if (! isdigit ((unsigned char)*cp)) {
02177                 return eFalse;
02178             }
02179             found_first_number = eTrue;
02180         } else if (! found_dividing_space) {
02181             if ( isspace ((unsigned char) *cp)) {
02182                 found_dividing_space = eTrue;
02183             } else if ( ! isdigit ((unsigned char)*cp)) {
02184                 return eFalse;
02185             }
02186         } else if (! found_second_number) {
02187             if ( isdigit ((unsigned char)*cp)) {
02188                 found_second_number = eTrue;
02189             } else if (! isspace ((unsigned char) *cp)) {
02190                 return eFalse;
02191             }
02192         } else if (! found_second_number_end) {
02193             if ( isspace ((unsigned char) *cp)) {
02194                 found_second_number_end = eTrue;
02195             } else if (! isdigit ((unsigned char)*cp)) {
02196                 return eFalse;
02197             }
02198         } else if (! isspace ((unsigned char) *cp)) {
02199             return eFalse;
02200         }
02201         cp++;
02202     }
02203     if (found_second_number) {
02204         return eTrue;
02205     }
02206     return eFalse;
02207 }
02208 
02209 
02210 /* This function finds a value name in a string, looks for an equals sign
02211  * after the value name, and then looks for an integer value after the
02212  * equals sign.  If the integer value is found, the function copies the
02213  * integer value into the val location and returns eTrue, otherwise the
02214  * function returns eFalse.
02215  */
02216 static EBool 
02217 s_GetOneNexusSizeComment 
02218 (char * str,
02219  char * valname, 
02220  int  * val)
02221 {
02222     char   buf[MAX_PRINTED_INT_LEN_PLUS_ONE];
02223     char * cpstart;
02224     char * cpend;
02225     int    maxlen;
02226 
02227     if (str == NULL  ||  valname == NULL  ||  val == NULL) {
02228         return eFalse;
02229     }
02230 
02231     cpstart = strstr (str, valname);
02232     if (cpstart == NULL) {
02233         return eFalse;
02234     }
02235     cpstart += strlen (valname);
02236     while (*cpstart != 0  &&  isspace ((unsigned char)*cpstart)) {
02237         cpstart++;
02238     }
02239     if (*cpstart != '=') {
02240         return eFalse;
02241     }
02242     cpstart ++;
02243     while (*cpstart != 0  &&  isspace ((unsigned char)*cpstart)) {
02244         cpstart++;
02245     }
02246 
02247     if (! isdigit ((unsigned char)*cpstart)) {
02248         return eFalse;
02249     }
02250     cpend = cpstart + 1;
02251     while ( *cpend != 0  &&  isdigit ((unsigned char)*cpend)) {
02252         cpend ++;
02253     }
02254     maxlen = cpend - cpstart;
02255     if (maxlen > kMaxPrintedIntLen) maxlen = kMaxPrintedIntLen;
02256 
02257     strncpy (buf, cpstart, maxlen);
02258     buf [maxlen] = 0;
02259     *val = atoi (buf);
02260     return eTrue;
02261 }
02262 
02263 
02264 /* This function looks for Nexus-style comments to indicate the number of
02265  * sequences and the number of characters per sequence expected from this
02266  * alignment file.  If the function finds these comments, it returns eTrue,
02267  * otherwise it returns eFalse.
02268  */
02269 static void 
02270 s_GetNexusSizeComments 
02271 (char *           str,
02272  EBool *          found_ntax,
02273  EBool *          found_nchar,
02274  SAlignRawFilePtr afrp)
02275 {
02276     int  num_sequences;
02277     int  num_chars;
02278   
02279     if (str == NULL  ||  found_nchar == NULL  
02280         ||  found_ntax == NULL  ||  afrp == NULL) {
02281         return;
02282     }
02283     if (! *found_ntax  && 
02284         (s_GetOneNexusSizeComment (str, "ntax", &num_sequences)
02285         ||   s_GetOneNexusSizeComment (str, "NTAX", &num_sequences))) {
02286         afrp->expected_num_sequence = num_sequences;
02287         afrp->align_format_found = eTrue;
02288         *found_ntax = eTrue;
02289     }
02290     if (! *found_nchar  &&
02291         (s_GetOneNexusSizeComment (str, "nchar", &num_chars)
02292         ||  s_GetOneNexusSizeComment (str, "NCHAR", &num_chars))) {
02293         afrp->expected_sequence_len = num_chars;
02294         afrp->align_format_found = eTrue;
02295         *found_nchar = eTrue;
02296     }
02297 }
02298 
02299 
02300 /* This function looks for characters in Nexus-style comments to 
02301  * indicate values for specific kinds of characters (match, missing, gap...).
02302  * If the string str contains val_name followed by an equals sign, the function
02303  * will return the first non-whitespace character following the equals sign,
02304  * otherwise the function will return a 0.
02305  */
02306 static char GetNexusTypechar (char * str, char * val_name)
02307 {
02308     char * cp;
02309     char * cpend;
02310 
02311     if (str == NULL  ||  val_name == NULL) {
02312         return 0;
02313     }
02314     cpend = strstr (str, ";");
02315     if (cpend == NULL) {
02316         return 0;
02317     }
02318     cp = strstr (str, val_name);
02319     if (cp == NULL || cp > cpend) {
02320         return 0;
02321     }
02322     cp += strlen (val_name);
02323     while ( isspace ((unsigned char)*cp)) {
02324         cp ++;
02325     }
02326     if (*cp != '=') {
02327         return 0;
02328     }
02329     cp++;
02330     while ( isspace ((unsigned char)*cp) || *cp == '\'') {
02331         cp ++;
02332     }
02333     return *cp;
02334 }
02335 
02336 
02337 /* This function reads a Nexus-style comment line for the characters 
02338  * specified for missing, match, and gap and compares the characters from
02339  * the comment with the characters specified in sequence_info.  If any
02340  * discrepancies are found, the function reports the errors and returns eFalse,
02341  * otherwise the function returns eTrue.
02342  */ 
02343 static EBool s_CheckNexusCharInfo 
02344 (char *               str,
02345  TSequenceInfoPtr     sequence_info,
02346  FReportErrorFunction errfunc,
02347  void *              errdata)
02348 {
02349     char * cp;
02350     char   c;
02351 
02352     if (str == NULL  ||  sequence_info == NULL) {
02353         return eFalse;
02354     }
02355 
02356     cp = strstr (str, "format ");
02357     if (cp == NULL) {
02358         cp = strstr (str, "FORMAT ");
02359     }
02360     if (cp == NULL) {
02361         return eFalse;
02362     }
02363 
02364     if (errfunc == NULL) {
02365         return eTrue;
02366     }
02367 
02368     c = GetNexusTypechar (cp + 7, "missing");
02369     if (c == 0) {
02370         c = GetNexusTypechar (cp + 7, "MISSING");
02371     }
02372     if (c != 0  &&  sequence_info->missing != NULL
02373         &&  strchr (sequence_info->missing, c) == NULL)
02374     {
02375         s_ReportCharCommentError (sequence_info->missing, c, "MISSING",
02376                                 errfunc, errdata);
02377     }
02378  
02379     c = GetNexusTypechar (cp + 7, "gap");
02380     if (c == 0) {
02381         c = GetNexusTypechar (cp + 7, "GAP");
02382     }
02383     if (c != 0  &&  sequence_info->middle_gap != NULL
02384         &&  strchr (sequence_info->middle_gap, c) == NULL)
02385     {
02386         s_ReportCharCommentError (sequence_info->middle_gap, c, "GAP",
02387                                 errfunc, errdata);
02388     }
02389  
02390     c = GetNexusTypechar (cp + 7, "match");
02391     if (c == 0) {
02392         c = GetNexusTypechar (cp + 7, "MATCH");
02393     }
02394     if (c != 0  &&  sequence_info->match != NULL
02395         &&  strchr (sequence_info->match, c) == NULL)
02396     {
02397         s_ReportCharCommentError (sequence_info->match, c, "MATCH",
02398                                 errfunc, errdata);
02399     }
02400     return eTrue;
02401 } 
02402 
02403 
02404 static char * s_ReplaceNexusTypeChar (char *str, char c)
02405 {
02406     if (str == NULL
02407         || c != *str 
02408         || *(str + 1) != 0)
02409     {
02410         if (str != NULL)
02411         {
02412           free (str);
02413         }
02414         str = (char *)malloc (2 * sizeof (char));
02415         if (str != NULL)
02416         {
02417           str [0] = c;
02418           str [1] = 0;
02419         }
02420     }
02421     return str;
02422 }
02423 
02424 /* This function reads a Nexus-style comment line for the characters 
02425  * specified for missing, match, and gap and sets those values in sequence_info.
02426  * The function returns eTrue if a Nexus comment was found, eFalse otherwise.
02427  */ 
02428 static EBool s_UpdateNexusCharInfo 
02429 (char *               str,
02430  TSequenceInfoPtr     sequence_info)
02431 {
02432     char * cp;
02433     char   c;
02434 
02435     if (str == NULL  ||  sequence_info == NULL) {
02436         return eFalse;
02437     }
02438 
02439     cp = strstr (str, "format ");
02440     if (cp == NULL) {
02441         cp = strstr (str, "FORMAT ");
02442     }
02443     if (cp == NULL) {
02444         return eFalse;
02445     }
02446 
02447     c = GetNexusTypechar (cp + 7, "missing");
02448     if (c == 0) {
02449         c = GetNexusTypechar (cp + 7, "MISSING");
02450     }
02451     sequence_info->missing = s_ReplaceNexusTypeChar (sequence_info->missing, c);
02452     
02453     c = GetNexusTypechar (cp + 7, "gap");
02454     if (c == 0) {
02455         c = GetNexusTypechar (cp + 7, "GAP");
02456     }
02457     sequence_info->beginning_gap = s_ReplaceNexusTypeChar (sequence_info->beginning_gap, c);
02458     sequence_info->middle_gap = s_ReplaceNexusTypeChar (sequence_info->middle_gap, c);
02459     sequence_info->end_gap = s_ReplaceNexusTypeChar (sequence_info->end_gap, c);
02460  
02461     c = GetNexusTypechar (cp + 7, "match");
02462     if (c == 0) {
02463         c = GetNexusTypechar (cp + 7, "MATCH");
02464     }
02465     sequence_info->match = s_ReplaceNexusTypeChar (sequence_info->match, c);
02466 
02467     return eTrue;
02468 } 
02469 
02470 
02471 /* This function examines the string str to see if it consists entirely of
02472  * asterisks, colons, periods, and whitespace.  If so, this line is assumed
02473  * to be a Clustal-style consensus line and the function returns eTrue.
02474  * otherwise the function returns false;
02475  */
02476 static EBool s_IsConsensusLine (char * str)
02477 {
02478     if (str == NULL 
02479         ||  strspn (str, "*:. \t\r\n") < strlen (str)
02480         ||  (strchr (str, '*') == NULL 
02481              &&  strchr (str, ':') == NULL
02482              &&  strchr (str, '.') == NULL)) {
02483         return eFalse;
02484     } else {
02485         return eTrue;
02486     } 
02487 }
02488 
02489 
02490 /* This function identifies lines that begin with a NEXUS keyword and end
02491  * with a semicolon - they will not contain sequence data.  The function
02492  * returns eTrue if the line contains only a NEXUS comment, eFalse otherwise.
02493  */
02494 static EBool s_SkippableNexusComment (char *str)
02495 {
02496     char * last_semicolon;
02497 
02498     if (str == NULL) {
02499         return eFalse;
02500     }
02501     last_semicolon = strrchr (str, ';');
02502     if (last_semicolon == NULL
02503         ||  strspn (last_semicolon + 1, " \t\r") != strlen (last_semicolon + 1)
02504         ||  strchr (str, ';') != last_semicolon) {
02505         return eFalse;
02506     }
02507     if (s_StringNICmp (str, "format ", 7) == 0
02508         ||  s_StringNICmp (str, "dimensions ", 11) == 0
02509         ||  s_StringNICmp (str, "dimensions ", 11) == 0
02510         ||  s_StringNICmp (str, "options ", 8) == 0
02511         ||  s_StringNICmp (str, "begin characters", 16) == 0
02512         ||  s_StringNICmp (str, "begin data", 10) == 0) {
02513         return eTrue;
02514     } else {
02515         return eFalse;
02516     }
02517 }
02518 
02519 
02520 /* This function determines whether the contents of str are "skippable"
02521  * in that they do not contain sequence data and therefore should not be
02522  * considered part of any block patterns or sequence data.
02523  */
02524 static EBool s_SkippableString (char * str)
02525 {
02526     if (str == NULL
02527         ||  s_StringNICmp (str, "matrix", 6) == 0
02528         ||  s_StringNICmp (str, "#NEXUS", 6) == 0
02529         ||  s_StringNICmp (str, "CLUSTAL W", 8) == 0
02530         ||  s_SkippableNexusComment (str)
02531         ||  s_IsTwoNumbersSeparatedBySpace (str)
02532         ||  s_IsConsensusLine (str)
02533         ||  str [0] == ';') {
02534         return eTrue;
02535     } else {
02536         return eFalse;
02537     }
02538 }
02539 
02540 
02541 /* This function determines whether str contains a indication
02542  * that this is real alignment format (nexus, clustal, etc.)
02543  */
02544 static EBool s_IsAlnFormatString (char * str)
02545 {
02546     if (s_StringNICmp (str, "matrix", 6) == 0
02547         ||  s_StringNICmp (str, "#NEXUS", 6) == 0
02548         ||  s_StringNICmp (str, "CLUSTAL W", 8) == 0
02549         ||  s_SkippableNexusComment (str)
02550         ||  s_IsTwoNumbersSeparatedBySpace (str)
02551         ||  s_IsConsensusLine (str)) {
02552         return eTrue;
02553     } else {
02554         return eFalse;
02555     }
02556 }
02557 
02558 
02559 /* This function determines whether or not str contains a blank line.
02560  */
02561 static EBool s_IsBlank (char * str)
02562 {
02563     size_t len;
02564 
02565     if (str == NULL) {
02566         return eTrue;
02567     }
02568     len = strspn (str, " \t\r");
02569     if (len == strlen (str)) {
02570         return eTrue;
02571     }
02572     return eFalse;
02573 }
02574 
02575 
02576 /* This function determines whether or not linestring contains a line
02577  * indicating the end of sequence data (organism information and definition
02578  * lines may occur after this line).
02579  */
02580 static EBool s_FoundStopLine (char * linestring)
02581 {
02582     if (linestring == NULL) {
02583         return eFalse;
02584     }
02585     if (s_StringNICmp (linestring, "endblock", 8) == 0
02586         ||  s_StringNICmp (linestring, "end;", 4) == 0) {
02587         return eTrue;
02588     }
02589     return eFalse;
02590 }
02591 
02592 
02593 /* This function identifies the beginning line of an ASN.1 file, which
02594  * cannot be read by the alignment reader.
02595  */
02596 static EBool s_IsASN1 (char * linestring)
02597 {
02598     if (linestring != NULL  &&  strstr (linestring, "::=") != NULL) {
02599         return eTrue;
02600     } else {
02601         return eFalse;
02602     }
02603 }
02604 
02605 
02606 /* The following functions are used to locate and read comments enclosed
02607  * in brackets.  These comments sometimes include organism information.
02608  */
02609 
02610 /* This function frees memory associated with a SCommentLoc structure. */
02611 static void s_CommentLocFree (TCommentLocPtr clp)
02612 {
02613     if (clp == NULL) {
02614         return;
02615     }
02616     s_CommentLocFree (clp->next);
02617     free (clp);
02618 }
02619 
02620 
02621 /* This function finds the first comment enclosed in brackets and creates
02622  * a SCommentLoc structure to indicate the position of the comment
02623  * in the string.  The function returns a pointer to this structure if a
02624  * comment is found or a NULL if the string does not contain a bracketed 
02625  * comment.
02626  */
02627 static TCommentLocPtr s_FindComment (char * string)
02628 {
02629     char *         cp_start;
02630     char *         cp_end;
02631     TCommentLocPtr clp;
02632 
02633     if (string == NULL) {
02634         return NULL;
02635     }
02636     cp_start = strstr (string, "[");
02637     if (cp_start != NULL) {
02638         cp_end = strstr (cp_start, "]");
02639         if (cp_end != NULL) {
02640             clp = (TCommentLocPtr) malloc (sizeof (SCommentLoc));
02641             if (clp == NULL) {
02642                 return NULL;
02643             }
02644             clp->start = cp_start;
02645             clp->end = cp_end;
02646             clp->next = NULL;
02647             return clp;
02648         }
02649     }
02650     return NULL;
02651 }
02652 
02653 
02654 /* This function removes a comment from a line. */
02655 static void s_RemoveCommentFromLine (char * linestring)
02656 {
02657     TCommentLocPtr clp;
02658 
02659     if (linestring == NULL) {
02660         return;
02661     }
02662 
02663     clp = s_FindComment (linestring);
02664     while (clp != NULL) {
02665         strcpy (clp->start, clp->end + 1);
02666         s_CommentLocFree (clp);
02667         clp = s_FindComment (linestring);
02668     }
02669 
02670     /* if we have read an organism comment and that's all there was on the
02671      * line, get rid of the arrow character as well so it doesn't end up 
02672      * in the sequence data
02673      */
02674     if ( linestring [0] == '>'  &&  linestring [1] == 0) {
02675         linestring [0] = 0;
02676     }
02677 
02678     /* if the line now contains only space, truncate it */
02679     if (strspn (linestring, " \t\r") == strlen (linestring)) {
02680         linestring [0] = 0;
02681     }
02682     
02683 }
02684 
02685 
02686 /* This function determines whether or not a comment describes an organism
02687  * by looking for org= or organism= inside the brackets.
02688  */
02689 static EBool s_IsOrganismComment (TCommentLocPtr clp)
02690 {
02691     int    len;
02692     char * cp;
02693     char * cp_end;
02694 
02695     if (clp == NULL  ||  clp->start == NULL  ||  clp->end == NULL) {
02696         return eFalse;
02697     }
02698  
02699     cp = clp->start;
02700     if (*cp != '[') {
02701         return eFalse;
02702     }
02703     cp ++;
02704     len = strspn ( clp->start, " \t\r");
02705     cp = cp + len;
02706     cp_end = strstr (cp, "=");
02707     if (cp_end == NULL) {
02708         return eFalse;
02709     }
02710     cp_end --;
02711     while (cp_end > cp  &&  isspace ((unsigned char)*cp_end)) {
02712       cp_end --;
02713     }
02714     cp_end ++;
02715     if ((cp_end - cp == 3  &&  s_StringNICmp (cp, "org", 3) == 0)
02716         ||  (cp_end - cp == 8  &&  s_StringNICmp (cp, "organism", 8) == 0)) {
02717         return eTrue;
02718     }
02719     return eFalse;
02720 }
02721 
02722 
02723 /* This function finds an organism comment, which includes the first bracketed
02724  * comment with org= or organism=, plus any additional bracketed comments
02725  * separated only by whitespace from the org= or organism= comment.
02726  * The function returns a pointer to a SCommentLoc structure describing
02727  * the location of the organism comment.
02728  */
02729 static TCommentLocPtr s_FindOrganismComment (char * string)
02730 {
02731     TCommentLocPtr clp, next_clp;
02732 
02733     if (string == NULL) {
02734         return NULL;
02735     }
02736 
02737     clp = s_FindComment (string);
02738     while (clp != NULL  &&  ! s_IsOrganismComment (clp)) {
02739         clp = s_FindComment (clp->end);
02740     }
02741 
02742     if (clp == NULL) {
02743         return NULL;
02744     }
02745 
02746     next_clp = s_FindComment (clp->end);
02747     while (next_clp != NULL  && 
02748         (int) strspn (clp->end + 1, " \t\r") == next_clp->start - clp->end - 1
02749         &&  ! s_IsOrganismComment (next_clp))
02750     {
02751         clp->end = next_clp->end;
02752         next_clp = s_FindComment (clp->end);
02753     }
02754     return clp;
02755 }
02756 
02757 
02758 /* This function removes an organism comment from a line. */
02759 static void s_RemoveOrganismCommentFromLine (char * string)
02760 {
02761     TCommentLocPtr clp;
02762 
02763     while ((clp = s_FindOrganismComment (string)) != NULL) {
02764         strcpy (clp->start, clp->end + 1);
02765         s_CommentLocFree (clp);
02766     }
02767 }
02768 
02769  
02770 /* This function creates an ordered list of comments within an organism
02771  * comment and returns a pointer to the first item in the linked list.
02772  * In an ordered org name, the org= value appears first, followed by other
02773  * bracketed values in alphabetical order.
02774  */
02775 static TCommentLocPtr s_CreateOrderedOrgCommentList (TCommentLocPtr org_clp)
02776 {
02777     TCommentLocPtr clp, prev_clp, next_clp, clp_list, ordered_start;
02778     int           next_len, this_len, len;
02779   
02780     if (org_clp == NULL) {
02781         return NULL;
02782     }
02783 
02784     clp_list = s_FindComment (org_clp->start); /* this is the org= */
02785     prev_clp = NULL;
02786     ordered_start = s_FindComment (clp_list->end);
02787     if (s_IsOrganismComment (ordered_start))
02788     {
02789       s_CommentLocFree (ordered_start);
02790       ordered_start = NULL;
02791     }
02792     if (ordered_start == NULL) {
02793         return clp_list;
02794     }
02795     clp = s_FindComment (ordered_start->end);
02796     while (clp != NULL  &&  clp->start < org_clp->end) {
02797         /* insert new comment into list */
02798         prev_clp = NULL;
02799         next_clp = ordered_start;
02800         next_len = next_clp->end - next_clp->start;
02801         this_len = clp->end - clp->start;
02802         len = next_len > this_len ? next_len : this_len;
02803         while (next_clp != NULL
02804           &&  strncmp (next_clp->start, clp->start, len) < 0)
02805         {
02806             prev_clp = next_clp;
02807             next_clp = next_clp->next;
02808             if (next_clp != NULL) {
02809                 next_len = next_clp->end - next_clp->start;
02810                 len = next_len > this_len ? next_len : this_len;
02811             }
02812         }
02813         if (prev_clp == NULL) {
02814             clp->next = ordered_start;
02815             ordered_start = clp;
02816         } else {
02817             clp->next = prev_clp->next;
02818             prev_clp->next = clp;
02819         }
02820         clp = s_FindComment (clp->end);
02821     }
02822     clp_list->next = ordered_start;
02823     return clp_list;
02824 }
02825 
02826 
02827 /* This function creates an ordered organism name based on the bracketed
02828  * comments contained in the location described by org_clp.
02829  */
02830 static char * s_CreateOrderedOrgName (TCommentLocPtr org_clp)
02831 {
02832     TCommentLocPtr clp, clp_list;
02833     char *         ordered_org_name;
02834     char *         cp;
02835 
02836     if (org_clp == NULL) {
02837         return NULL;
02838     }
02839 
02840     ordered_org_name = (char *)malloc (org_clp->end - org_clp->start + 2);
02841     if (ordered_org_name == NULL) {
02842         return NULL;
02843     }
02844     ordered_org_name [0] = 0;
02845     clp_list = s_CreateOrderedOrgCommentList (org_clp);
02846     cp = ordered_org_name;
02847     for (clp = clp_list; clp != NULL; clp = clp->next) {
02848         strncpy (cp, clp->start, clp->end - clp->start + 1);
02849         cp += clp->end - clp->start + 1;
02850         *cp = 0;
02851     }
02852     
02853     s_CommentLocFree (clp_list);
02854 
02855     return ordered_org_name;
02856 }
02857 
02858 static void s_AddDeflineFromOrganismLine 
02859 (char             *defline, 
02860  int              line_num,
02861  int              defline_offset,
02862  SAlignRawFilePtr afrp)
02863 {
02864     TLineInfoPtr lip;
02865     int          org_num, defline_num, new_len;
02866     char         *empty_defline, *new_defline;
02867     
02868     if (afrp == NULL || defline == NULL) {
02869         return;
02870     }
02871     
02872     /* make sure that we are adding the definition line to the correct position
02873      * in the list - should match last organism name */
02874     lip = afrp->organisms;
02875     org_num = 0;
02876     while (lip != NULL)
02877     {
02878         org_num++;
02879         lip = lip->next;
02880     }
02881     
02882     lip = afrp->deflines;
02883     defline_num = 0;
02884     while (lip != NULL  &&  defline_num < org_num) {
02885         lip = lip->next;
02886         defline_num ++;
02887     }
02888     
02889     if (defline_num == org_num) {
02890         /* if previous defline is empty, replace with new defline */
02891         if (strlen (lip->data) == 0)
02892         {
02893             free (lip->data);
02894             lip->data = defline;
02895         }
02896         else
02897         {
02898             /* append defline to the end of the existing entry */
02899             new_len = strlen (lip->data) + strlen (defline) + 2;
02900             new_defline = (char *) malloc (new_len * sizeof (char));
02901             if (new_defline != NULL)
02902             {
02903                 strcpy (new_defline, lip->data);
02904                 strcat (new_defline, " ");
02905                 strcat (new_defline, defline);
02906                 free (lip->data);
02907                 lip->data = new_defline;
02908                 free (defline);
02909                 defline = NULL;
02910             }
02911         }
02912         /* use new line numbers */
02913         lip->line_num = line_num;
02914         lip->line_offset = defline_offset;
02915         lip->delete_me = eFalse;        
02916     }
02917     else
02918     {
02919         /* add empty deflines to get to the correct position */
02920         while (defline_num < org_num - 1)
02921         {
02922             empty_defline = (char *) malloc (sizeof (char));
02923             if (empty_defline != NULL)
02924             {
02925                 *empty_defline = 0;
02926                 afrp->deflines = s_AddLineInfo (afrp->deflines, 
02927                                                 empty_defline, 0,
02928                                                 0);
02929                 afrp->num_deflines ++;
02930             }
02931             defline_num++;
02932         }
02933         /* now add new defline in correct position */
02934         afrp->deflines = s_AddLineInfo (afrp->deflines, defline, 
02935                                         line_num, defline_offset);
02936         afrp->num_deflines ++;
02937     }
02938 }
02939 
02940 /* This function is used to read any organism names that may appear in
02941  * string, including any modifiers that may appear after the organism name.
02942  */
02943 static void s_ReadOrgNamesFromText 
02944 (char *           string,
02945  int              line_num,
02946  SAlignRawFilePtr afrp)
02947 {
02948     TCommentLocPtr clp;
02949     char *         org_name;
02950     char *         cp;
02951     char *         defline;
02952     char *         comment_end;
02953     int            defline_offset;
02954   
02955     if (string == NULL  ||  afrp == NULL) {
02956         return;
02957     }
02958 
02959     clp = s_FindOrganismComment (string);
02960     if (clp == NULL && (strstr (string, "org=") != NULL || strstr (string, "organism=") != NULL))
02961     {
02962       s_ReportOrgCommentError (string, afrp->report_error, afrp->report_error_userdata);
02963     }
02964     while (clp != NULL) {
02965         org_name = s_CreateOrderedOrgName (clp);
02966         afrp->organisms = s_AddLineInfo (afrp->organisms, org_name, line_num,
02967                                        clp->start - string);
02968         free (org_name);
02969         afrp->num_organisms ++;
02970         defline = NULL;
02971         defline_offset = 0;
02972         if (*clp->end != 0) {
02973             cp = clp->end + 1;
02974             cp += strspn (cp, " \t\r\n");
02975             if (*cp != 0) {
02976                 defline = clp->end + 1;
02977                 defline_offset = clp->end - string + 1;
02978             }
02979         }
02980         s_AddDeflineFromOrganismLine (defline, line_num, defline_offset, afrp);
02981                                       
02982         comment_end = clp->end;
02983         s_CommentLocFree (clp);
02984         clp = s_FindOrganismComment (comment_end);
02985     }
02986 }
02987 
02988 
02989 /* The following group of functions manages the SAlignRawSeq structure,
02990  * which is used to track the IDs of sequences in the file, the sequence
02991  * characters for those IDs, and the locations of the IDs and sequence
02992  * characters.
02993  */
02994 
02995 /* This function allocates memory for an SAlignRawSeq structure,
02996  * initializes its member variables, and returns a pointer to the newly
02997  * allocated structure.
02998  */
02999 static TAlignRawSeqPtr s_AlignRawSeqNew (TAlignRawSeqPtr list)
03000 {
03001     TAlignRawSeqPtr arsp, last;
03002 
03003     arsp = (TAlignRawSeqPtr)malloc (sizeof (SAlignRawSeq));
03004     if (arsp == NULL) {
03005         return NULL;
03006     }
03007     arsp->id            = NULL;
03008     arsp->sequence_data = NULL;
03009     arsp->id_lines      = NULL;
03010     arsp->next          = NULL;
03011 
03012     last = list;
03013     while (last != NULL && last->next != NULL) {
03014         last = last->next;
03015     }
03016     if (last != NULL) {
03017         last->next = arsp;
03018     }
03019     return arsp;
03020 }
03021 
03022 
03023 /* This function frees the memory associated with an SAlignRawSeq
03024  * structure's member variables and with the structure itself.
03025  */
03026 static void s_AlignRawSeqFree (TAlignRawSeqPtr arsp)
03027 {
03028     if (arsp == NULL) {
03029         return;
03030     }
03031     s_AlignRawSeqFree (arsp->next);
03032     free (arsp->id);
03033     s_LineInfoFree (arsp->sequence_data);
03034     s_IntLinkFree (arsp->id_lines);
03035     free (arsp);
03036 }
03037 
03038 
03039 /* This function returns a pointer to the sequence in list with the specified
03040  * ID, unless there is no such sequence, in which case the function returns
03041  * NULL.
03042  */
03043 static TAlignRawSeqPtr 
03044 s_FindAlignRawSeqById 
03045 (TAlignRawSeqPtr list,
03046  char *          id)
03047 {
03048     TAlignRawSeqPtr arsp;
03049 
03050     for (arsp = list; arsp != NULL; arsp = arsp->next) {
03051         if (strcmp (arsp->id, id) == 0) {
03052             return arsp;
03053         }
03054     }
03055     return NULL;
03056 }
03057 
03058 
03059 /* This function finds the position of a given ID in the sequence list,
03060  * unless the ID is not found in the list, in which case the function returns
03061  * -1.
03062  */
03063 static int  
03064 s_FindAlignRawSeqOffsetById 
03065 (TAlignRawSeqPtr list, 
03066  char *          id)
03067 {
03068     TAlignRawSeqPtr arsp;
03069     int             offset;
03070 
03071     for (arsp = list, offset = 0; arsp != NULL; arsp = arsp->next, offset++) {
03072         if (strcmp (arsp->id, id) == 0) {
03073             return offset;
03074         }
03075     }
03076     return -1;
03077 }
03078 
03079 
03080 /* This function returns a pointer to the memory in which the ID for the
03081  * Nth sequence is stored, unless there aren't that many sequences, in which
03082  * case NULL is returned.
03083  */
03084 static char * 
03085 s_GetAlignRawSeqIDByOffset 
03086 (TAlignRawSeqPtr list,
03087  int  offset)
03088 {
03089     TAlignRawSeqPtr arsp;
03090     int            index;
03091 
03092     arsp = list;
03093     index = 0;
03094     while ( arsp != NULL  &&  index != offset ) {
03095         arsp = arsp->next;
03096         index++;
03097     }
03098     if (index == offset  &&  arsp != NULL) {
03099         return arsp->id;
03100     } else {
03101         return NULL;
03102     }
03103 }
03104 
03105 
03106 /* This function adds data to a sequence by looking for the specified ID in
03107  * the list.  If the id is not found, a new sequence with that ID is added to
03108  * the end of the list.
03109  * The function returns a pointer to the first item in the list.
03110  */
03111 static TAlignRawSeqPtr
03112 s_AddAlignRawSeqById
03113 (TAlignRawSeqPtr list,
03114  char *  id,
03115  char *  data,
03116  int     id_line_num,
03117  int     data_line_num,
03118  int     data_line_offset)
03119 {
03120     TAlignRawSeqPtr arsp;
03121     TIntLinkPtr     ilp;
03122 
03123     arsp = s_FindAlignRawSeqById (list, id);
03124     if (arsp == NULL) {
03125         arsp = s_AlignRawSeqNew (list);
03126         if (arsp == NULL) {
03127             return NULL;
03128         }
03129         if (list == NULL) list = arsp;
03130         arsp->id = strdup (id);
03131     }
03132     arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
03133                                        data,
03134                                        data_line_num,
03135                                        data_line_offset);
03136     ilp = s_IntLinkNew (id_line_num, arsp->id_lines);
03137     if (arsp->id_lines == NULL) arsp->id_lines = ilp;
03138     return list;
03139 }
03140 
03141 
03142 /* This function adds data to the Nth sequence in the sequence list and
03143  * returns eTrue, unless there aren't that many sequences in the list, in
03144  * which case the function returns eFalse.
03145  */
03146 static EBool 
03147 s_AddAlignRawSeqByIndex 
03148 (TAlignRawSeqPtr list,
03149  int     index,
03150  char *  data,
03151  int     data_line_num,
03152  int     data_line_offset)
03153 {
03154     TAlignRawSeqPtr arsp;
03155     int            curr;
03156 
03157     curr = 0;
03158     for (arsp = list; arsp != NULL  &&  curr < index; arsp = arsp->next) {
03159         curr++;
03160     }
03161     if (arsp == NULL) {
03162         return eFalse;
03163     } else {
03164         arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
03165                                            data,
03166                                            data_line_num,
03167                                            data_line_offset);
03168         return eTrue;
03169     }
03170 }
03171 
03172 
03173 /* This function frees memory associated with the SAlignRawFileData structure.
03174  */
03175 static void s_AlignFileRawFree (SAlignRawFilePtr afrp)
03176 {
03177     if (afrp == NULL) {
03178         return;
03179     }
03180 
03181     s_LineInfoFree (afrp->organisms);
03182     s_LineInfoFree (afrp->deflines);
03183     s_LineInfoFree (afrp->line_list);
03184     s_AlignRawSeqFree (afrp->sequences);
03185     s_IntLinkFree (afrp->offset_list);
03186     free (afrp->alphabet);
03187     free (afrp);
03188 }
03189 
03190 
03191 /* This function allocates memory for an SAlignRawFileData structure and
03192  * initializes its member variables.  The function returns a pointer to
03193  * the newly allocated structure.
03194  */
03195 static SAlignRawFilePtr s_AlignFileRawNew (void)
03196 {
03197     SAlignRawFilePtr afrp;
03198 
03199     afrp = (SAlignRawFilePtr)malloc (sizeof (SAlignRawFileData));
03200     if (afrp == NULL) {
03201         return NULL;
03202     }
03203     afrp->marked_ids            = eFalse;
03204     afrp->line_list             = NULL;
03205     afrp->organisms             = NULL;
03206     afrp->num_organisms         = 0;
03207     afrp->deflines              = NULL;
03208     afrp->num_deflines          = 0;
03209     afrp->block_size            = 0;
03210     afrp->offset_list           = NULL;
03211     afrp->sequences             = NULL;
03212     afrp->report_error          = NULL;
03213     afrp->report_error_userdata = NULL;
03214     afrp->alphabet              = NULL;
03215     afrp->expected_num_sequence = 0;
03216     afrp->expected_sequence_len = 0;
03217     afrp->num_segments          = 1;
03218     afrp->align_format_found    = eFalse;
03219     return afrp;
03220 }
03221 
03222 
03223 /* The following functions are used to analyze the structure of a file and
03224  * assemble the sequences listed in the file.
03225  * Sequence data in a file is organized in one of two general formats - 
03226  * interleaved or contiguous.  Interleaved data can be recognized by looking
03227  * for repeated blocks of the same number of lines within a file separated
03228  * by blank or skippable lines from other lines in the file.  The first of
03229  * these blocks must have at least two elements separated by whitespace
03230  * in each line, the first of these elements is the ID for the sequence in
03231  * that row and for the sequences in that position within the block for the
03232  * remainder of the file.
03233  * Contiguous data can be recognized by either looking for "marked" sequence
03234  * IDs, which begin with a '>' character, or by looking for repeated patterns
03235  * of lines with the same numbers of characters.
03236  */
03237 
03238 /* The following functions are used to analyze interleaved data. */
03239 
03240 /* This function creates a SLengthListData structure that describes the pattern
03241  * of character lengths in the string pointed to by cp.
03242  */
03243 static SLengthListPtr s_GetBlockPattern (char * cp)
03244 {
03245     SLengthListPtr this_pattern;
03246     int           len;
03247 
03248     this_pattern = s_LengthListNew (NULL);
03249     if (this_pattern == NULL) {
03250         return NULL;
03251     }
03252 
03253     this_pattern->num_appearances = 1;
03254     while (*cp != 0) {
03255         len = strcspn (cp, " \t\r");
03256         s_AddLengthRepeat (this_pattern, len);
03257         cp += len;
03258         cp += strspn (cp, " \t\r");
03259     }
03260     return this_pattern;
03261 }
03262 
03263 
03264 /* This function attempts to predict whether the following lines will be
03265  * an interleaved block.  If so, the function returns the location of the
03266  * beginning of the block, otherwise the function returns -1.
03267  */
03268 static int 
03269 s_ForecastBlockPattern 
03270 (SLengthListPtr pattern_list,
03271  TIntLinkPtr    next_offset,
03272  int            line_start,
03273  int            block_size)
03274 {
03275     int  line_counter;
03276     SLengthListPtr llp;
03277 
03278     line_counter = line_start;
03279     if (next_offset != NULL
03280         &&  next_offset->ival - line_counter < block_size) {
03281         return -1;
03282     }
03283     
03284     for (llp = pattern_list;
03285          llp != NULL
03286            &&  (next_offset == NULL  ||  line_counter < next_offset->ival - 1)
03287            &&  line_counter - line_start < block_size;
03288          llp = llp->next)
03289     {
03290         if (llp->lengthrepeats == NULL) {
03291             return -1;
03292         }
03293         line_counter += llp->num_appearances;
03294     }
03295     if (line_counter - line_start == block_size) {
03296         /* we've found a combination of groups of similarly sized lines
03297          * that add up to the desired block size - is the next line blank,
03298          * or are there additional non-blank lines?
03299          */
03300         if (llp == NULL /* The block ended with the last line in the file */
03301             || llp->lengthrepeats == NULL) { /* or the next line is blank */
03302             return line_start;
03303         }
03304     }
03305     return -1;
03306 }
03307 
03308 
03309 /* This function looks for malformed blocks between the identified blocks
03310  * indicated by the offset_list.  It returns a pointer to the list with the
03311  * new locations inserted at the appropriate locations.
03312  */
03313 static TIntLinkPtr
03314 s_AugmentBlockPatternOffsetList
03315 (SLengthListPtr pattern_list,
03316  TIntLinkPtr    offset_list,
03317  int            block_size)
03318 {
03319     int            line_counter;
03320     SLengthListPtr llp;
03321     TIntLinkPtr    next_offset, prev_offset, new_offset;
03322     int            forecast_pos;
03323 
03324     prev_offset = NULL;
03325     next_offset = offset_list;
03326     line_counter = 0;
03327     llp = pattern_list;
03328     while (llp != NULL) {
03329         if (next_offset != NULL  &&  line_counter == next_offset->ival) {
03330             prev_offset = next_offset;
03331             next_offset = next_offset->next;
03332             /* skip past the lines for this block */
03333             while (line_counter - prev_offset->ival < block_size
03334                    &&  llp != NULL)
03335             {
03336                 line_counter += llp->num_appearances;
03337                 llp = llp->next;
03338             }
03339         } else {
03340             forecast_pos = s_ForecastBlockPattern (llp, next_offset,
03341                                                  line_counter,
03342                                                  block_size);
03343             if (forecast_pos > 0) {
03344                 new_offset = s_IntLinkNew (forecast_pos, NULL);
03345                 if (new_offset == NULL) {
03346                     return NULL;
03347                 }
03348                 if (prev_offset == NULL) {
03349                     new_offset->next = offset_list;
03350                     offset_list = new_offset;
03351                 } else {
03352                     new_offset->next = next_offset;
03353                     prev_offset->next = new_offset;
03354                 }
03355                 prev_offset = new_offset;
03356                 /* skip past the lines for this block */
03357                 while (line_counter - prev_offset->ival < block_size
03358                        &&  llp != NULL)
03359                 {
03360                     line_counter += llp->num_appearances;
03361                     llp = llp->next;
03362                 }
03363             } else {
03364                 line_counter += llp->num_appearances;
03365                 llp = llp->next;
03366             }
03367         }
03368     }
03369     return offset_list;
03370 }
03371 
03372 
03373 /* This function looks for lines that could not be assigned to an interleaved
03374  * block.  It returns eTrue if it finds any such lines after the first offset,
03375  * eFalse otherwise, and reports all instances of unused lines as errors.
03376  */
03377 static EBool
03378 s_FindUnusedLines 
03379 (SLengthListPtr pattern_list,
03380  SAlignRawFilePtr afrp)
03381 {
03382     TIntLinkPtr    offset;
03383     SLengthListPtr llp;
03384     int            line_counter;
03385     int            block_line_counter;
03386     EBool          rval = eFalse;
03387     TLineInfoPtr   line_val;
03388     int            skip;
03389 
03390     if (pattern_list == NULL  ||  afrp == NULL
03391         ||  afrp->offset_list == NULL  ||  afrp->block_size < 2) {
03392         return eFalse;
03393     }
03394     
03395     offset = afrp->offset_list;
03396     llp = pattern_list;
03397     line_counter = 0;
03398     line_val = afrp->line_list;
03399  
03400     while (llp != NULL  &&  line_val != NULL) {
03401         while (llp != NULL  &&  line_val != NULL
03402                &&  (offset == NULL  ||  line_counter < offset->ival)) {
03403             if (llp->lengthrepeats != NULL) {
03404                 s_ReportUnusedLine (line_counter,
03405                                     line_counter + llp->num_appearances - 1,
03406                                     line_val,
03407                                     afrp->report_error,
03408                                     afrp->report_error_userdata);
03409                 if (offset != afrp->offset_list) {
03410                     rval = eTrue;
03411                 }
03412             }
03413             line_counter += llp->num_appearances;
03414             for (skip = 0;
03415                  skip < llp->num_appearances  &&  line_val != NULL;
03416                  skip++) {
03417                 line_val = line_val->next;
03418             }
03419             llp = llp->next;
03420         }
03421         block_line_counter = 0;
03422         while (block_line_counter < afrp->block_size  &&  llp != NULL) {
03423             block_line_counter += llp->num_appearances;
03424             line_counter += llp->num_appearances;
03425             for (skip = 0;
03426                  skip < llp->num_appearances  &&  line_val != NULL;
03427                  skip++) {
03428                 line_val = line_val->next;
03429             }
03430             llp = llp->next;
03431         }
03432         if (offset != NULL) {
03433             offset = offset->next;
03434         }
03435     }
03436     return rval;
03437 }
03438 
03439 
03440 /* This function examines a list of line lengths, looking for interleaved
03441  * blocks.  If it finds them, it will set the SAlignRawFileData offset_list
03442  * member variable to point to a list of locations for the blocks.
03443  */
03444 static void
03445 s_FindInterleavedBlocks 
03446 (SLengthListPtr pattern_list,
03447  SAlignRawFilePtr afrp)
03448 {
03449     SLengthListPtr llp, llp_next;
03450     TSizeInfoPtr   size_list, best_ptr;
03451     TIntLinkPtr    new_offset;
03452     int            line_counter;
03453 
03454     afrp->block_size = 0;
03455     size_list = NULL;
03456     afrp->offset_list = NULL;
03457     for (llp = pattern_list; llp != NULL; llp = llp->next) {
03458         llp_next = llp->next;
03459         if (llp->num_appearances > 1 
03460             &&  (llp_next == NULL  ||  llp_next->lengthrepeats == NULL)) {
03461             size_list = s_AddSizeInfo (size_list, llp->num_appearances);
03462         }
03463     }
03464     best_ptr = s_GetMostPopularSizeInfo (size_list);
03465     if (best_ptr != NULL  
03466         &&  (best_ptr->num_appearances > 1  ||  
03467              (size_list->next == NULL  &&  size_list->size_value > 1))) {
03468         afrp->block_size = best_ptr->size_value;
03469         line_counter = 0;
03470         for (llp = pattern_list; llp != NULL; llp = llp->next) {
03471             llp_next = llp->next;
03472             if (llp->num_appearances == afrp->block_size
03473                 &&  (llp_next == NULL  ||  llp_next->lengthrepeats == NULL))
03474             {
03475                 new_offset = s_IntLinkNew (line_counter, afrp->offset_list);
03476                 if (new_offset == NULL) {
03477                     return;
03478                 }
03479                 if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
03480             }
03481             line_counter += llp->num_appearances;
03482         }
03483         afrp->offset_list = s_AugmentBlockPatternOffsetList (pattern_list,
03484                                                            afrp->offset_list, 
03485                                                            afrp->block_size);
03486     }
03487     if (s_FindUnusedLines (pattern_list, afrp)) {
03488         s_IntLinkFree (afrp->offset_list);
03489         afrp->offset_list = NULL;
03490         afrp->block_size = 0;
03491     } else {
03492         afrp->align_format_found = eTrue;
03493     }
03494     s_SizeInfoFree (size_list);
03495     
03496 }
03497 
03498 static void s_TrimEndSpace (char *linestring)
03499 {
03500     int len;
03501     char *cp;
03502   
03503     if (linestring == NULL) return;
03504     len = strlen (linestring);
03505     cp = linestring + len - 1;
03506     while (cp > linestring && (*cp == ' ' || *cp == '\t' || *cp == '\r' || *cp == '\n'))
03507     {
03508         *cp = 0;
03509         cp--;
03510     }
03511 }
03512 
03513 static SAlignRawFilePtr
03514 s_ReadAlignFileRaw
03515 (FReadLineFunction    readfunc,
03516  void *               userdata,
03517  TSequenceInfoPtr     sequence_info,
03518  EBool                use_nexus_file_info,
03519  FReportErrorFunction errfunc,
03520  void *               errdata)
03521 {
03522     char *                   linestring;
03523     SAlignRawFilePtr         afrp;
03524     char *                   tmp;
03525     EBool                    found_stop;
03526     int                      overall_line_count;
03527     EBool                    found_expected_ntax = eFalse;
03528     EBool                    found_expected_nchar = eFalse;
03529     EBool                    found_char_comment = eFalse;
03530     SLengthListPtr           pattern_list = NULL;
03531     SLengthListPtr           this_pattern, last_pattern = NULL;
03532     char *                   cp;
03533     int                      len;
03534     TIntLinkPtr              new_offset;
03535     EBool                    in_taxa_comment;
03536     EBool                    in_bracketed_comment = eFalse;
03537     TBracketedCommentListPtr comment_list = NULL, last_comment = NULL;
03538     EBool                    last_line_was_marked_id = eFalse;
03539     TLineInfoPtr             last_line = NULL, next_line;
03540 
03541     if (readfunc == NULL  ||  sequence_info == NULL) {
03542         return NULL;
03543     }
03544 
03545     afrp = s_AlignFileRawNew ();
03546     if (afrp == NULL) {
03547         return NULL;
03548     }
03549   
03550     afrp->alphabet = strdup (sequence_info->alphabet);
03551     afrp->report_error = errfunc;
03552     afrp->report_error_userdata = errdata;
03553 
03554     overall_line_count = 0;
03555     found_stop = eFalse;
03556     in_taxa_comment = eFalse;
03557     linestring = readfunc (userdata);
03558     if (s_IsASN1 (linestring)) {
03559         s_ReportASN1Error (afrp->report_error, afrp->report_error_userdata);
03560         s_AlignFileRawFree (afrp);
03561         return NULL;
03562     }
03563 
03564     while (linestring != NULL  &&  linestring [0] != EOF) {
03565         s_TrimEndSpace (linestring);
03566         s_ReadOrgNamesFromText (linestring, overall_line_count, afrp);
03567         /* we want to remove the comment from the line for the purpose 
03568          * of looking for blank lines and skipping,
03569          * but save comments for storing in array if line is not skippable or
03570          * blank
03571          */ 
03572         len = strspn (linestring, " \t\r\n");
03573         tmp = strdup (linestring + len);
03574         if (tmp == NULL) {
03575             return NULL;
03576         }
03577  
03578         if (! found_stop && ! in_taxa_comment) {
03579             found_stop = s_FoundStopLine (tmp);
03580         }
03581         if (! found_stop) {
03582             if (! found_expected_ntax  ||  ! found_expected_nchar) {
03583                 if (s_IsTwoNumbersSeparatedBySpace (tmp)) {
03584                     s_GetFASTAExpectedNumbers (tmp, afrp);
03585                     found_expected_ntax = eTrue;
03586                     found_expected_nchar = eTrue;
03587                     afrp->align_format_found = eTrue;
03588                } else {
03589                     s_GetNexusSizeComments (tmp, &found_expected_ntax,
03590                                             &found_expected_nchar, afrp);
03591                 }
03592             }
03593             if (! found_char_comment) {
03594               if (use_nexus_file_info) {
03595                 found_char_comment = s_UpdateNexusCharInfo (tmp, sequence_info);
03596               } else {
03597                 found_char_comment = s_CheckNexusCharInfo (tmp, sequence_info, 
03598                                                            afrp->report_error,
03599                                                            afrp->report_error_userdata);
03600               }
03601             }
03602             if (in_taxa_comment) {
03603                 if (strncmp (tmp, "end;", 4) == 0) {
03604                     in_taxa_comment = eFalse;
03605                 } 
03606                 tmp [0] = 0;
03607             } else if (strncmp (tmp, "begin taxa;", 11) == 0) {
03608                 tmp [0] = 0;
03609                 in_taxa_comment = eTrue;
03610                 afrp->align_format_found = eTrue;
03611             }
03612 
03613             /* remove complete single-line bracketed comments from line 
03614              *before checking for multiline bracketed comments */
03615             s_RemoveCommentFromLine (tmp);
03616 
03617             if (in_bracketed_comment) {
03618                 len = strspn (linestring, " \t\r\n");
03619                 if (last_comment != NULL) 
03620                 {
03621                     s_BracketedCommentListAddLine (last_comment, linestring + len,
03622                                                    overall_line_count, len);
03623                 }
03624                 if (strchr (tmp, ']') != NULL) {
03625                     in_bracketed_comment = eFalse;
03626                 }
03627                 tmp [0] = 0;
03628             } else if (tmp [0] == '[' && strchr (tmp, ']') == NULL) {
03629                 in_bracketed_comment = eTrue;
03630                 len = strspn (linestring, " \t\r\n");
03631                 last_comment = s_BracketedCommentListNew (comment_list,
03632                                                           linestring + len,
03633                                                           overall_line_count, len);
03634                 if (comment_list == NULL) 
03635                 {
03636                     comment_list = last_comment;
03637                 }
03638                 tmp [0] = 0;
03639             }
03640 
03641             if (!afrp->align_format_found  && s_IsAlnFormatString (tmp)) {
03642                 afrp->align_format_found = eTrue;
03643             }                  
03644 
03645             if (s_SkippableString (tmp)) {
03646                 tmp [0] = 0;
03647             }
03648             if (tmp [0] == '>'  &&  ! found_stop) {
03649                 /* this could be a block of organism lines in a
03650                  * NEXUS file.  If there is no sequence data between
03651                  * the lines, don't process this file for marked IDs.
03652                  */
03653                 if (last_line_was_marked_id)
03654                 {
03655                     afrp->marked_ids = eFalse;
03656                 }
03657                 else
03658                 {
03659                     afrp->marked_ids = eTrue;
03660                 }
03661                 new_offset = s_IntLinkNew (overall_line_count + 1,
03662                                           afrp->offset_list);
03663                 if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
03664                 last_line_was_marked_id = eTrue;
03665             } else {
03666                 last_line_was_marked_id = eFalse;
03667                 /* add to length list for interleaved block search */
03668                 len = strcspn (tmp, " \t\r");
03669                 if (len > 0) {
03670                     cp = tmp + len;
03671                     len = strspn (cp, " \t\r");
03672                     if (len > 0) {
03673                         cp = cp + len;
03674                     }
03675                     if (*cp == 0) {
03676                         this_pattern = s_GetBlockPattern (tmp);
03677                     } else {
03678                         this_pattern = s_GetBlockPattern (cp);
03679                     }                    
03680                 } else {
03681                     this_pattern = s_GetBlockPattern (tmp);
03682                 }
03683                 
03684                 if (last_pattern == NULL) {
03685                     pattern_list = this_pattern;
03686                     last_pattern = this_pattern;
03687                 } else if (s_DoLengthPatternsMatch (last_pattern, this_pattern)) {
03688                     last_pattern->num_appearances ++;
03689                     s_LengthListFree (this_pattern);
03690                 } else {
03691                     last_pattern->next = this_pattern;
03692                     last_pattern = this_pattern;
03693                 }
03694             }
03695 
03696             len = strspn (linestring, " \t\r\n");
03697 
03698             next_line = s_LineInfoNew (linestring + len, overall_line_count, len);
03699 
03700             if (last_line == NULL) {
03701                 afrp->line_list = next_line;
03702             } else {
03703                 last_line->next = next_line;
03704             }
03705             last_line = next_line;
03706         }
03707         free (linestring);
03708         free (tmp);
03709         linestring = readfunc (userdata);
03710         overall_line_count ++;
03711     }
03712     afrp->num_segments = s_GetNumSegmentsInAlignment (comment_list, errfunc, errdata);
03713     if (afrp->num_segments > 1) 
03714     {
03715         if (afrp->offset_list != NULL)
03716         {
03717             s_ReportSegmentedAlignmentError (afrp->offset_list,
03718                                              errfunc, errdata);
03719             s_AlignFileRawFree (afrp);
03720             s_LengthListFree (pattern_list);
03721             s_BracketedCommentListFree (comment_list);
03722             return NULL;            
03723         }
03724         else
03725         {
03726             afrp->offset_list = GetSegmentOffsetList (comment_list);
03727             afrp->marked_ids = eTrue;
03728         }
03729     }
03730     if (! afrp->marked_ids) {
03731         s_FindInterleavedBlocks (pattern_list, afrp);
03732     }
03733     s_LengthListFree (pattern_list);
03734     s_BracketedCommentListFree (comment_list);
03735     return afrp;
03736 }
03737 
03738 
03739 /* This function analyzes a block to see if it contains, as the first element
03740  * of any of its lines, one of the sequence IDs already identified.  If the
03741  * one of the lines does begin with a sequence ID, all of the lines are
03742  * assumed to begin with sequence IDs and the function returns eTrue, otherwise
03743  * the function returns eFalse.
03744  */
03745 static EBool 
03746 s_DoesBlockHaveIds 
03747 (SAlignRawFilePtr afrp, 
03748  TLineInfoPtr     first_line,
03749  int             num_lines_in_block)
03750 {
03751     TLineInfoPtr    lip;
03752     char *          linestring;
03753     char *          this_id;
03754     TAlignRawSeqPtr arsp;
03755     size_t          len;
03756     int             block_offset;
03757 
03758     if (afrp->sequences == NULL) {
03759          return eTrue;
03760     }
03761 
03762     for (lip = first_line, block_offset = 0;
03763          lip != NULL  &&  block_offset < num_lines_in_block;
03764          lip = lip->next, block_offset++)
03765     {
03766         linestring = lip->data;
03767         if (linestring != NULL) {
03768             len = strcspn (linestring, " \t\r");
03769             if (len > 0  &&  len < strlen (linestring)) {
03770                 this_id = (char *) malloc (len + 1);
03771                 if (this_id == NULL) {
03772                     return eFalse;
03773                 }
03774                 strncpy (this_id, linestring, len);
03775                 this_id [len] = 0;
03776                 arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
03777                 free (this_id);
03778                 if (arsp != NULL) {
03779                     return eTrue;
03780                 }
03781             }
03782         }
03783     }
03784     return eFalse;
03785 }
03786 
03787 
03788 /* This function analyzes the lines of the block to see if the pattern of
03789  * the lengths of the whitespace-separated pieces of sequence data matches
03790  * for all lines within the block.  The function returns eTrue if this is so,
03791  * otherwise the function returns eFalse.
03792  */
03793 static EBool 
03794 s_BlockIsConsistent
03795 (SAlignRawFilePtr afrp,
03796  TLineInfoPtr     first_line,
03797  int              num_lines_in_block,
03798  EBool            has_ids,
03799  EBool            first_block)
03800 {
03801     TLineInfoPtr   lip;
03802     SLengthListPtr list, this_pattern, best;
03803     int            len, block_offset, id_offset;
03804     char *         tmp_id;
03805     EBool          rval;
03806     char *         cp;
03807 
03808     rval = eTrue;
03809     list = NULL;
03810     for (lip = first_line, block_offset = 0;
03811          lip != NULL  &&  block_offset < num_lines_in_block;
03812          lip = lip->next, block_offset ++)
03813     {
03814         cp = lip->data;
03815         if (has_ids) {
03816             len = strcspn (cp, " \t\r");
03817             if (first_block && len == strlen (cp)) {
03818                 /* PHYLIP IDs are exactly 10 characters long
03819                  * and may not have a space between the ID and
03820                  * the sequence.
03821                  */
03822                 len = 10;
03823             }
03824             tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
03825             if (tmp_id == NULL) {
03826                 return eFalse;
03827             }
03828             strncpy (tmp_id, cp, len);
03829             tmp_id [len] = 0;
03830             id_offset = s_FindAlignRawSeqOffsetById (afrp->sequences, tmp_id);
03831             if (id_offset != block_offset  &&  ! first_block) {
03832                 rval = eFalse;
03833                 s_ReportInconsistentID (tmp_id, lip->line_num,
03834                                       afrp->report_error,
03835                                       afrp->report_error_userdata);
03836             }
03837             free (tmp_id);
03838             cp += len;
03839             cp += strspn (cp, " \t\r");
03840         }
03841         this_pattern = s_GetBlockPattern (cp);
03842         list = s_AddLengthList (list, this_pattern);
03843     }
03844 
03845     /* Now find the pattern with the most appearances */
03846     best = NULL;
03847     for (this_pattern = list;
03848          this_pattern != NULL;
03849          this_pattern = this_pattern->next)
03850     {
03851         if (this_pattern->num_appearances == 0) continue;
03852         if (best == NULL 
03853           ||  this_pattern->num_appearances > best->num_appearances)
03854         {
03855             best = this_pattern;
03856         }
03857     }
03858 
03859     /* now identify and report inconsistent lines */
03860     for (lip = first_line, block_offset = 0;
03861          lip != NULL  &&  block_offset < num_lines_in_block;
03862          lip = lip->next, block_offset ++)
03863     {
03864         cp = lip->data;
03865         if (has_ids) {
03866             len = strcspn (cp, " \t\r");
03867             if (first_block && len == strlen (cp)) {
03868                 /* PHYLIP IDs are exactly 10 characters long
03869                  * and may not have a space between the ID and
03870                  * the sequence.
03871                  */
03872                 len = 10;
03873             }        
03874             tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
03875             if (tmp_id == NULL) {
03876                 return eFalse;
03877             }
03878             strncpy (tmp_id, cp, len);
03879             tmp_id [len] = 0;
03880             cp += len;
03881             cp += strspn (cp, " \t\r");
03882         } else {
03883             tmp_id = s_GetAlignRawSeqIDByOffset (afrp->sequences, block_offset);
03884         }
03885         this_pattern = s_GetBlockPattern (cp);
03886         if ( ! s_DoLengthPatternsMatch (this_pattern, best)) {
03887             rval = eFalse;
03888             s_ReportInconsistentBlockLine (tmp_id, lip->line_num,
03889                                          afrp->report_error,
03890                                          afrp->report_error_userdata);
03891         }
03892         s_LengthListFree (this_pattern);
03893         if (has_ids) {
03894             free (tmp_id);
03895         }
03896     }
03897     s_LengthListFree (list);
03898     return rval;
03899 }
03900 
03901 
03902 /* This function processes a block of lines and adds the sequence data from
03903  * each line in the block to the appropriate sequence in the list.
03904  */
03905 static void 
03906 s_ProcessBlockLines 
03907 (SAlignRawFilePtr afrp,
03908  TLineInfoPtr     lines,
03909  int              num_lines_in_block,
03910  EBool            first_block)
03911 {
03912     TLineInfoPtr    lip;
03913     char *          linestring;
03914     char *          cp;
03915     char *          this_id;
03916     int             len;
03917     int             line_number;
03918     EBool           this_block_has_ids;
03919     int             pos;
03920     TAlignRawSeqPtr arsp;
03921 
03922     this_block_has_ids = s_DoesBlockHaveIds (afrp, lines, num_lines_in_block);
03923     s_BlockIsConsistent (afrp, lines, num_lines_in_block, this_block_has_ids,
03924                        first_block);
03925     for (lip = lines, line_number = 0;
03926          lip != NULL  &&  line_number < num_lines_in_block;
03927          lip = lip->next, line_number ++)
03928     {
03929         linestring = lip->data;
03930         if (linestring != NULL) {
03931             pos = 0;
03932             if (this_block_has_ids) {
03933                 len = strcspn (linestring, " \t\r");
03934                 if (first_block && len == strlen (linestring)) {
03935                     /* PHYLIP IDs are exactly ten characters long,
03936                      * and may not have a space before the start of
03937                      * the sequence data.
03938                      */
03939                     len = 10;
03940                 }
03941                 this_id = (char *) malloc (len + 1);
03942                 if (this_id == NULL) {
03943                     return;
03944                 }
03945                 strncpy (this_id, linestring, len);
03946                 this_id [len] = 0;
03947                 cp = linestring + len;
03948                 pos += len;
03949                 len = strspn (cp, " \t\r");
03950                 cp += len;
03951                 pos += len;
03952                 /* Check for duplicate IDs in the first block */
03953                 if (first_block)
03954                 {
03955                   arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
03956                   if (arsp != NULL)
03957                   {
03958                     s_ReportDuplicateIDError (this_id, lip->line_num,
03959                                               afrp->report_error,
03960                                               afrp->report_error_userdata);
03961                   }
03962                 }
03963                 afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
03964                                                       this_id, cp,
03965                                                       lip->line_num,
03966                                                       lip->line_num,
03967                                            lip->line_offset + cp - linestring);
03968                 free (this_id);
03969             } else {
03970                 if (! s_AddAlignRawSeqByIndex (afrp->sequences, line_number,
03971                                              linestring, 
03972                                              lip->line_num, lip->line_offset))
03973                 {
03974                     s_ReportBlockLengthError ("", lip->line_num,
03975                                             afrp->block_size,
03976                                             line_number,
03977                                             afrp->report_error,
03978                                             afrp->report_error_userdata);
03979                 }
03980             }
03981         }
03982     }
03983 }
03984 
03985 
03986 /* This function removes comments from the lines of an interleaved block of
03987  * data.
03988  */
03989 static void
03990 s_RemoveCommentsFromBlock
03991 (TLineInfoPtr first_line,
03992  int         num_lines_in_block)
03993 {
03994     TLineInfoPtr lip;
03995     int         block_offset;
03996 
03997     for (lip = first_line, block_offset = 0;
03998          lip != NULL  &&  block_offset < num_lines_in_block;
03999          lip = lip->next)
04000     {                   
04001         s_RemoveCommentFromLine (lip->data);
04002     }
04003 }
04004 
04005 
04006 /* This function processes the interleaved block of data found at each
04007  * location listed in afrp->offset_list.
04008  */
04009 static void s_ProcessAlignRawFileByBlockOffsets (SAlignRawFilePtr afrp)
04010 {
04011     int           line_counter;
04012     TIntLinkPtr   offset_ptr;
04013     TLineInfoPtr  lip;
04014     EBool         first_block = eTrue;
04015     EBool         in_taxa_comment = eFalse;
04016  
04017     if (afrp == NULL) {
04018         return;
04019     }
04020  
04021     line_counter = 0;
04022     offset_ptr = afrp->offset_list;
04023     lip = afrp->line_list;
04024     while (lip != NULL  &&  offset_ptr != NULL
04025            &&  (in_taxa_comment  ||  ! s_FoundStopLine (lip->data))) {
04026         if (in_taxa_comment) {
04027             if (strncmp (lip->data, "end;", 4) == 0) {
04028                 in_taxa_comment = eFalse;
04029             } 
04030         } else if (lip->data != NULL
04031             &&  strncmp (lip->data, "begin taxa;", 11) == 0) {
04032             in_taxa_comment = eTrue;
04033         }
04034         if (line_counter == offset_ptr->ival) {
04035             s_RemoveCommentsFromBlock (lip, afrp->block_size);
04036             s_ProcessBlockLines (afrp, lip, afrp->block_size, first_block);
04037             first_block = eFalse;
04038             offset_ptr = offset_ptr->next;
04039         }
04040         lip = lip->next;
04041         line_counter ++;
04042     }
04043 }
04044 
04045 
04046 /* The following functions are used to analyze contiguous data. */
04047 
04048 static void 
04049 s_CreateSequencesBasedOnTokenPatterns 
04050 (TLineInfoPtr     token_list,
04051  TIntLinkPtr      offset_list,
04052  SLengthListPtr * anchorpattern,
04053  SAlignRawFilePtr afrp)
04054 {
04055     TLineInfoPtr lip;
04056     int          line_counter;
04057     TIntLinkPtr  offset_ptr, next_offset_ptr;
04058     char *       curr_id;
04059     TSizeInfoPtr sip;
04060     int          pattern_line_counter;
04061     int          curr_seg;
04062 
04063     if (token_list == NULL  ||  offset_list == NULL
04064         ||  anchorpattern == NULL 
04065         ||  afrp == NULL)
04066     {
04067         return;
04068     }
04069     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04070     {
04071         if (anchorpattern [curr_seg] == NULL || anchorpattern [curr_seg]->lengthrepeats == NULL)
04072         {
04073             return;
04074         }
04075     }
04076 
04077     line_counter = 0;
04078     lip = token_list;
04079     offset_ptr = offset_list;
04080     curr_seg = 0;
04081   
04082     for (offset_ptr = offset_list;
04083          offset_ptr != NULL  &&  lip != NULL;
04084          offset_ptr = offset_ptr->next)
04085     {
04086         next_offset_ptr = offset_ptr->next;
04087         while (line_counter < offset_ptr->ival - 1  &&  lip != NULL) {
04088             lip = lip->next;
04089             line_counter ++;
04090         }
04091         if (lip != NULL) {
04092             curr_id = lip->data;
04093             lip = lip->next;
04094             line_counter ++;
04095             for (sip = anchorpattern[curr_seg]->lengthrepeats;
04096                  sip != NULL
04097                    &&  lip != NULL
04098                    &&  (next_offset_ptr == NULL 
04099                      ||  line_counter < next_offset_ptr->ival - 1);
04100                  sip = sip->next)
04101             {
04102                 for (pattern_line_counter = 0;
04103                      pattern_line_counter < sip->num_appearances
04104                          &&  lip != NULL
04105                          &&  (next_offset_ptr == NULL
04106                              ||  line_counter < next_offset_ptr->ival - 1);
04107                      pattern_line_counter ++)
04108                 {
04109                     if (lip->data [0]  !=  ']'  &&  lip->data [0]  != '[') {
04110                         if ((int) strlen (lip->data) != sip->size_value) {
04111                             s_ReportLineLengthError (curr_id, lip, 
04112                                                      sip->size_value,
04113                                                      afrp->report_error,
04114                                                      afrp->report_error_userdata);
04115                         }
04116                         afrp->sequences = s_AddAlignRawSeqById (afrp->sequences, 
04117                                                                 curr_id, 
04118                                                                 lip->data,
04119                                                                 lip->line_num,
04120                                                                 lip->line_num,
04121                                                                 lip->line_offset);
04122                     }
04123                     lip = lip->next;
04124                     line_counter ++;
04125                 }
04126             }
04127             if (sip != NULL  &&  lip != NULL) {
04128                 s_ReportBlockLengthError (curr_id, lip->line_num,
04129                                         afrp->block_size,
04130                                         line_counter - offset_ptr->ival,
04131                                         afrp->report_error,
04132                                         afrp->report_error_userdata);
04133             }
04134         }
04135         curr_seg ++;
04136         if (curr_seg >= afrp->num_segments)
04137         {
04138             curr_seg = 0;
04139         }
04140     }        
04141 }
04142 
04143 
04144 /* The following functions are used for analyzing contiguous data with
04145  * marked IDs.
04146  */
04147 
04148 /* This function creates a new LengthList pattern for each marked ID.
04149  * After each new list is created, the function checks to see if the
04150  * new pattern matches any pattern already in the list of patterns seen.
04151  * If so, the function deletes the new pattern and increments 
04152  * num_appearances for the pattern in the list, otherwise the function
04153  * adds the new pattern to the list.
04154  * When the list is complete, the function finds the pattern with the 
04155  * most appearances and returns that pattern as the anchor pattern to use
04156  * when checking sequence data blocks for consistency with one another.
04157  */
04158 static SLengthListPtr *
04159 s_CreateAnchorPatternForMarkedIDs 
04160 (SAlignRawFilePtr afrp)
04161 {
04162     SLengthListPtr * list;
04163     SLengthListPtr * best;
04164     SLengthListPtr this_pattern;
04165     char *         cp;
04166     TLineInfoPtr   lip;
04167     int            curr_seg;
04168 
04169     if (afrp == NULL) {
04170         return NULL;
04171     }
04172 
04173     /* initialize length lists */
04174     list = (SLengthListPtr *) malloc (afrp->num_segments * sizeof (SLengthListPtr));
04175     if (list == NULL) 
04176     {
04177         return NULL;
04178     }
04179     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04180     {
04181         list[curr_seg] = NULL;
04182     }
04183     /* initialize best ptrs */
04184     /* list is one element longer, to hold null terminator */
04185     best = (SLengthListPtr *) malloc ((afrp->num_segments + 1) * sizeof (SLengthListPtr));
04186     if (best == NULL) 
04187     {
04188         return NULL;
04189     }
04190     for (curr_seg = 0; curr_seg < afrp->num_segments + 1; curr_seg ++)
04191     {
04192         best[curr_seg] = NULL;
04193     }
04194     
04195     /* initialize pattern */
04196     this_pattern = NULL;
04197 
04198     curr_seg = 0;
04199     for (lip = afrp->line_list;
04200          lip != NULL  &&  ! s_FoundStopLine (lip->data);
04201          lip = lip->next)
04202     {
04203         if (lip->data == NULL) continue;
04204         if (lip->data [0] == ']' || lip->data [0] == '[') continue;
04205         if (lip->data [0] == '>') {
04206             if (this_pattern != NULL) {
04207                 list [curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
04208                 curr_seg ++;
04209                 if (curr_seg >= afrp->num_segments) 
04210                 {
04211                     curr_seg = 0;
04212                 }
04213             }
04214             this_pattern = s_LengthListNew (NULL);
04215             if (this_pattern == NULL) {
04216                 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04217                 {
04218                   s_LengthListFree (list [curr_seg]);
04219                 }
04220                 free (list);
04221                 return NULL;
04222             }
04223             this_pattern->num_appearances = 1;
04224         } else if (this_pattern != NULL) {
04225             /* This section gets rid of base pair number comments */
04226             cp = lip->data;
04227             while ( isspace ((unsigned char)*cp)  ||  isdigit ((unsigned char)*cp)) {
04228                 cp++;
04229             }
04230             s_AddLengthRepeat (this_pattern, strlen (cp));
04231         }
04232     }
04233     if (this_pattern != NULL) {
04234         list[curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
04235     }
04236 
04237     /* Now find the pattern with the most appearances for each segment*/
04238     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg++)
04239     {
04240         for (this_pattern = list [curr_seg];
04241              this_pattern != NULL;
04242              this_pattern = this_pattern->next)
04243         {
04244             if (this_pattern->num_appearances == 0) continue;
04245             if (best [curr_seg] == NULL 
04246               ||  this_pattern->num_appearances > best[curr_seg]->num_appearances)
04247             {
04248                 best[curr_seg] = this_pattern;
04249             }
04250             
04251         }
04252 
04253         /* free all patterns before and after anchor pattern */
04254         if (best [curr_seg] != NULL) {
04255             s_LengthListFree (best [curr_seg]->next);
04256             best [curr_seg]->next = NULL;
04257         }
04258 
04259         if (best [curr_seg] != list [curr_seg]) {
04260             this_pattern = list [curr_seg];
04261             while ( this_pattern != NULL  &&  this_pattern->next != best[curr_seg] ) {
04262                 this_pattern = this_pattern->next;
04263             }
04264             if (this_pattern != NULL) {
04265                 this_pattern->next = NULL;
04266                 s_LengthListFree (list [curr_seg]);
04267             }
04268         }
04269     }
04270 
04271     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04272     {
04273         if (best[curr_seg] == NULL) 
04274         {
04275             for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04276             {
04277                 s_LengthListFree (best [curr_seg]);
04278             }
04279             return NULL;
04280         }
04281     }
04282     
04283     return best;
04284 }
04285 
04286 
04287 /* This function removes base pair count comments from the data sections
04288  * for contiguous marked ID sequences.
04289  */
04290 static void s_RemoveBasePairCountCommentsFromData (SAlignRawFilePtr afrp)
04291 {
04292     TIntLinkPtr  this_offset, next_offset;
04293     TLineInfoPtr lip;
04294     int          line_count;
04295     char *       cp;
04296 
04297     if (afrp == NULL  ||  afrp->offset_list == NULL) {
04298         return;
04299     }
04300     this_offset = afrp->offset_list;
04301     next_offset = this_offset->next;
04302     lip = afrp->line_list;
04303     line_count = 0;
04304     while (lip != NULL  &&  this_offset != NULL) {
04305         if (line_count == this_offset->ival) {
04306             while (lip != NULL  && 
04307                    (next_offset == NULL
04308                    ||  line_count < next_offset->ival - 1)) {
04309                 cp = lip->data;
04310                 if (cp != NULL) {
04311                     cp += strspn (cp, " \t\r\n1234567890");
04312                     if (cp != lip->data) {
04313                         strcpy (lip->data, cp);
04314                     }
04315                 }
04316                 line_count ++;
04317                 lip = lip->next;
04318             }
04319             this_offset = this_offset->next;
04320             if (this_offset != NULL) {
04321                 next_offset = this_offset->next;
04322             }
04323         } else {
04324             line_count ++;
04325             lip = lip->next;
04326         }
04327     }
04328 }
04329 
04330  
04331 /* This function assumes that the offset_list has already been populated
04332  * with the locations of the data blocks.  It analyzes the blocks of data
04333  * to find the most frequently occurring pattern of lengths of data and then
04334  * uses that pattern to attach the data to the correct IDs and report any
04335  * errors in formatting.
04336  */
04337 static void s_ProcessAlignFileRawForMarkedIDs (SAlignRawFilePtr afrp)
04338 {
04339     SLengthListPtr * anchorpattern;
04340     
04341     if (afrp == NULL) {
04342         return;
04343     }
04344 
04345     s_RemoveBasePairCountCommentsFromData (afrp);
04346     anchorpattern = s_CreateAnchorPatternForMarkedIDs (afrp);
04347     if (anchorpattern == NULL  ||  afrp->offset_list == NULL) {
04348         return;
04349     }
04350     s_CreateSequencesBasedOnTokenPatterns (afrp->line_list, afrp->offset_list,
04351                                          anchorpattern, afrp);
04352 }
04353 
04354 
04355 /* The following functions are used for analyzing contiguous sequence data
04356  * without marked IDs.
04357  */
04358 
04359 /* This function left-shifts a string, character by character. */
04360 static void
04361 s_StringLeftShift
04362 (char * cp_from,
04363  char * cp_to)
04364 {
04365     if (cp_from == cp_to  ||  cp_from == NULL  ||  cp_to == NULL) {
04366         return;
04367     }
04368     while (*cp_to != 0) {
04369         *cp_from = *cp_to;
04370         cp_from++;
04371         cp_to++;
04372     }
04373     *cp_from = 0;
04374 }
04375 
04376 
04377 /* This function removes bracketed comments from a linked list of 
04378  * SLineInfo structures.  The function returns a pointer to the
04379  * list without the comments.
04380  */
04381 static TLineInfoPtr s_RemoveCommentsFromTokens (TLineInfoPtr list)
04382 {
04383     TLineInfoPtr  lip;
04384     int           num_comment_starts;
04385     char *        cp_l;
04386     char *        cp_r;
04387     char *        cp;
04388     EBool         in_comment;
04389 
04390     num_comment_starts = 0;
04391     in_comment = eFalse;
04392     for (lip = list;  lip != NULL;  lip = lip->next) {
04393         if (lip->data == NULL) {
04394             lip->delete_me = eTrue;
04395         } else {
04396             cp_l = NULL;
04397             cp_r = NULL;
04398             for (cp = lip->data; *cp != 0; cp++) {
04399                 if (*cp == ']') {
04400                     if (cp_r == NULL) {
04401                         s_StringLeftShift (lip->data, cp + 1);
04402                         cp = lip->data - 1;
04403                     } else {
04404                         s_StringLeftShift (cp_r, cp + 1);
04405                         cp = cp_r;
04406                         if (cp_r > lip->data) {
04407                             cp_r --;
04408                             while (cp_r >= lip->data  &&  *cp_r != '[') {
04409                                 cp_r --;
04410                             }
04411                             if (cp_r < lip->data) {
04412                                 cp_r = NULL;
04413                             }
04414                         } else {
04415                             cp_r = NULL;
04416                         }
04417                     }
04418                     if (num_comment_starts > 0) {
04419                         num_comment_starts --;
04420                     }
04421                 } else if (*cp == '[') {
04422                     cp_r = cp;
04423                     num_comment_starts ++;
04424                 }
04425             }
04426             if (in_comment) {
04427                 if (num_comment_starts == 0) {
04428                     in_comment = eFalse;
04429                 } else {
04430                     lip->delete_me = eTrue;
04431                 }
04432             } else if (num_comment_starts > 0) {
04433                 cp_r = strchr (lip->data, '[');
04434                 if (cp_r != NULL) {
04435                     *cp_r = 0;
04436                 }
04437                 in_comment = eTrue;
04438             }
04439             if (lip->data [0] == 0) {
04440                 lip->delete_me = eTrue;
04441             }
04442         }
04443     }
04444     list = s_DeleteLineInfos (list);
04445     return list;
04446 }
04447 
04448 
04449 /* This function removes Nexus comments from a linked list of SLineInfo
04450  * structures.  The function returns a pointer to the list without the
04451  * comments.
04452  */
04453 static TLineInfoPtr s_RemoveNexusCommentsFromTokens (TLineInfoPtr list) 
04454 {
04455     TLineInfoPtr lip, start_lip, end_lip;
04456 
04457     lip = list;
04458     start_lip = NULL;
04459     end_lip = NULL;
04460     while (lip != NULL) {
04461         if (s_StringICmp (lip->data, "#NEXUS") == 0) {
04462             start_lip = lip;
04463             end_lip = lip;
04464             while (end_lip != NULL 
04465                    &&  s_StringICmp (end_lip->data, "matrix") != 0) {
04466                 end_lip = end_lip->next;
04467             }
04468             if (end_lip != NULL) {
04469                 while (start_lip != end_lip) {
04470                     start_lip->delete_me = eTrue;
04471                     start_lip = start_lip->next;
04472                 }
04473                 end_lip->delete_me = eTrue;
04474                 lip = end_lip->next;
04475             } else {
04476                 lip = lip->next;
04477             }
04478         } else {
04479             lip = lip->next;
04480         }
04481     }
04482     list = s_DeleteLineInfos (list);
04483     return list;
04484 }
04485 
04486 
04487 /* This function finds the number of characters that occur most frequently
04488  * in a token and returns a pointer to a SSizeInfo structure that
04489  * describes the most common length and the number of times it appears.
04490  */
04491 static TSizeInfoPtr 
04492 s_FindMostFrequentlyOccurringTokenLength
04493 (TSizeInfoPtr list,
04494  int          not_this_size)
04495 {
04496     TSizeInfoPtr list_ptr, new_list, best_ptr, return_best;
04497 
04498     new_list = NULL;
04499     for (list_ptr = list;  list_ptr != NULL;  list_ptr = list_ptr->next) {
04500         if (not_this_size != list_ptr->size_value) {
04501             new_list = s_AddSizeInfoAppearances (new_list,
04502                                                list_ptr->size_value,
04503                                                list_ptr->num_appearances);
04504         }
04505     }
04506     best_ptr = s_GetMostPopularSizeInfo (new_list);
04507     return_best = NULL;
04508     if (best_ptr != NULL) {
04509         return_best = s_SizeInfoNew (NULL);
04510         if (return_best != NULL) {
04511             return_best->size_value = best_ptr->size_value;
04512             return_best->num_appearances = best_ptr->num_appearances;
04513         }
04514     }
04515     s_SizeInfoFree (new_list);
04516     return return_best;
04517 }
04518 
04519 
04520 /* This function examines all instances of an anchor pattern in the data
04521  * and checks to see if the line immediately after the anchor pattern should
04522  * be used as part of the anchor pattern.  This function exists because 
04523  * frequently, but not always, contiguous data will consist of multiple lines
04524  * of data of the same length (for example, 80 characters), followed by one
04525  * shorter line with the remaining data.  We must also make sure that we do
04526  * not accidentally include the ID of the next sequence in the data of the
04527  * previous sequence.
04528  */
04529 static void 
04530 s_ExtendAnchorPattern 
04531 (SLengthListPtr anchorpattern,
04532  TSizeInfoPtr   line_lengths)
04533 {
04534     TSizeInfoPtr last_line_lengths, sip, sip_next, twoafter;
04535     int         best_last_line_length;
04536     int         anchor_line_length;
04537 
04538     if (anchorpattern == NULL 
04539         ||  anchorpattern->lengthrepeats == NULL
04540         ||  line_lengths == NULL) {
04541        return;
04542     }
04543 
04544     last_line_lengths = NULL;
04545     anchor_line_length = anchorpattern->lengthrepeats->size_value;
04546 
04547     /* also check to make sure that there's more than one line between
04548      * this pattern and the next pattern, otherwise the next line is the
04549      * ID for the next pattern and shouldn't be included in the anchor
04550      */
04551     for (sip = line_lengths;  sip != NULL;  sip = sip->next) {
04552         if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
04553             sip_next = sip->next;
04554             if (sip_next != NULL
04555                 &&  sip_next->size_value > 0
04556                 &&  sip_next->size_value != anchor_line_length
04557                 &&  ((twoafter = sip_next->next) == NULL
04558                    ||  twoafter->size_value != anchor_line_length))
04559             {
04560                 last_line_lengths = s_AddSizeInfo (last_line_lengths,
04561                                                  sip_next->size_value);
04562             }
04563         }
04564     }
04565     best_last_line_length = s_GetMostPopularSize (last_line_lengths);
04566     if (best_last_line_length > 0) {
04567         s_AddLengthRepeat (anchorpattern, best_last_line_length);
04568     }
04569     s_SizeInfoFree (last_line_lengths);
04570 } 
04571 
04572 
04573 /* This function looks for the most frequently occurring pattern, where a 
04574  * pattern is considered to be N contiguous tokens of M characters.  The
04575  * function then checks to see if there is usually a token of a particular
04576  * length that immediately follows this pattern that is not the ID for the
04577  * next sequence.  If so, this line length is added to the pattern.
04578  * The function returns a pointer to this pattern.
04579  */
04580 static SLengthListPtr s_FindMostPopularPattern (TSizeInfoPtr list)
04581 {
04582     SLengthListPtr patternlist, newpattern;
04583     TSizeInfoPtr   sip, popular_line_length;
04584     SLengthListPtr index, best;
04585     int           not_this_length;
04586 
04587     patternlist = NULL;
04588     for (sip = list;  sip != NULL;  sip = sip->next) {
04589         if (sip->size_value > 0) {
04590             newpattern = s_LengthListNew (NULL);
04591             if (newpattern == NULL) {
04592                 s_LengthListFree (patternlist);
04593                 return NULL;
04594             }
04595             newpattern->num_appearances = 1;
04596             newpattern->lengthrepeats = s_SizeInfoNew (NULL);
04597             if (newpattern->lengthrepeats == NULL) {
04598                 s_LengthListFree (patternlist);
04599                 return NULL;
04600             }
04601             newpattern->lengthrepeats->size_value = sip->size_value;
04602             newpattern->lengthrepeats->num_appearances = sip->num_appearances;
04603             patternlist = s_AddLengthList (patternlist, newpattern);
04604         }
04605     }
04606     if (patternlist == NULL) {
04607         return NULL;
04608     }
04609 
04610     best = NULL;
04611     for (index = patternlist;  index != NULL;  index = index->next) {
04612         if (index->lengthrepeats->num_appearances < 2) {
04613             continue;
04614         }
04615         if (best==NULL  ||  best->num_appearances < index->num_appearances) {
04616             best = index;
04617         } else if (best->num_appearances == index->num_appearances
04618             &&  best->lengthrepeats->size_value < 
04619                                   index->lengthrepeats->size_value) {
04620             best = index;
04621         }
04622     }
04623 
04624     /* Free data in list before best pattern */
04625     index = patternlist;
04626     while ( index != NULL  &&  index->next != best ) {
04627          index = index->next;
04628     }
04629     if (index != NULL) {
04630         index->next = NULL;
04631         s_LengthListFree (patternlist);
04632     }
04633     /* Free data in list after best pattern */
04634     if (best != NULL) {
04635         s_LengthListFree (best->next);
04636         best->next = NULL;
04637     }
04638 
04639     popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list, 0);
04640 
04641     if (best != NULL  &&  best->lengthrepeats != NULL
04642       &&  popular_line_length != NULL
04643       &&  best->lengthrepeats->size_value == popular_line_length->size_value)
04644     {
04645         not_this_length = popular_line_length->size_value;
04646         s_SizeInfoFree (popular_line_length);
04647         popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list,
04648                                                         not_this_length);
04649     }
04650 
04651     if (best == NULL
04652         ||  (popular_line_length != NULL
04653           &&  popular_line_length->size_value > best->lengthrepeats->size_value
04654           &&  popular_line_length->num_appearances > best->num_appearances))
04655     {
04656         if (best == NULL) {
04657             best = s_LengthListNew (NULL);
04658             if (best == NULL) {
04659                 return NULL;
04660             }
04661         }
04662         best->lengthrepeats = s_SizeInfoNew (NULL);
04663         if (best->lengthrepeats == NULL) {
04664             return NULL;
04665         }
04666         best->lengthrepeats->size_value = popular_line_length->size_value;
04667         best->lengthrepeats->num_appearances = 1;
04668     } else {
04669         /* extend anchor pattern to include best length of last line */
04670         s_ExtendAnchorPattern (best, list);
04671     }
04672 
04673     s_SizeInfoFree (popular_line_length);
04674 
04675     return best;
04676 }
04677 
04678 
04679 /* This function creates an SIntLink list to describe the locations
04680  * of occurrences of the anchorpattern in the SSizeInfo list.
04681  * The function returns a pointer to the SIntLink list.
04682  */
04683 static TIntLinkPtr 
04684 s_CreateOffsetList 
04685 (TSizeInfoPtr list,
04686  SLengthListPtr anchorpattern)
04687 {
04688     int          line_counter;
04689     TIntLinkPtr  offset_list, new_offset;
04690     TSizeInfoPtr sip, prev_sip;
04691 
04692     if (list == NULL  ||  anchorpattern == NULL) {
04693         return NULL;
04694     }
04695     line_counter = 0;
04696     offset_list = NULL;
04697     prev_sip = NULL;
04698     for (sip = list;  sip != NULL;  sip = sip->next) {
04699         if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
04700             new_offset = s_IntLinkNew (line_counter, offset_list);
04701             if (new_offset == NULL) {
04702                 s_IntLinkFree (offset_list);
04703                 return NULL;
04704             }
04705             if (offset_list == NULL) {
04706                 offset_list = new_offset;
04707             }
04708         }
04709  
04710         line_counter += sip->num_appearances;
04711         prev_sip = sip;
04712     }
04713     return offset_list;
04714 }
04715 
04716 
04717 /* This function determines whether or not the number of expected sequence
04718  * characters are available starting at a token after line_start and stopping
04719  * at least one token before the next known sequence data block in the list.
04720  * If so, the function returns the number of the token at which the sequence
04721  * data begins.  Otherwise the function returns -1.
04722  */
04723 static int  
04724 s_ForecastPattern 
04725 (int          line_start,
04726  int          pattern_length,
04727  TIntLinkPtr  next_offset,
04728  int          sip_offset,
04729  TSizeInfoPtr list)
04730 {
04731     int  offset, end_offset;
04732     TSizeInfoPtr sip;
04733     int  line_counter, num_chars;
04734   
04735     if (list == NULL) {
04736         return -1;
04737     }
04738 
04739     for (offset = sip_offset; offset < list->num_appearances; offset++) {
04740         line_counter = line_start + offset;
04741         num_chars = list->size_value * (list->num_appearances - offset); 
04742         sip = list;
04743         while (num_chars < pattern_length
04744                 &&  (next_offset == NULL  ||  line_counter < next_offset->ival)
04745                 &&  sip->next != NULL)
04746         {
04747             sip = sip->next;
04748             for (end_offset = 0;
04749                  end_offset < sip->num_appearances
04750                      &&  num_chars < pattern_length
04751                      &&  (next_offset == NULL
04752                           ||  line_counter < next_offset->ival);
04753                  end_offset ++)
04754             {
04755                 num_chars += sip->size_value;
04756                 line_counter ++;
04757             }
04758         }
04759         if (num_chars == pattern_length) {
04760             return line_start + offset;
04761         }
04762     }
04763     return -1;
04764 }
04765 
04766 
04767 /* This function examines the offset list and searches for holes where blocks
04768  * of sequence data without the exact expected formatting might exist.  The
04769  * function adds the offsets of any new blocks to the list and returns a
04770  * pointer to the augmented offset list.
04771  */
04772 static TIntLinkPtr 
04773 s_AugmentOffsetList 
04774 (TIntLinkPtr    offset_list,
04775  TSizeInfoPtr   list,
04776  SLengthListPtr anchorpattern)
04777 {
04778     int           pattern_length;
04779     TSizeInfoPtr  sip;
04780     TIntLinkPtr   prev_offset, next_offset, new_offset;
04781     int           line_counter, forecast_position, line_skip;
04782     EBool         skipped_previous = eFalse;
04783     int           num_chars;
04784     int           num_additional_offsets = 0;
04785     int           max_additional_offsets = 5000; /* if it's that bad, forget it */
04786 
04787     if (list == NULL  ||  anchorpattern == NULL) {
04788         return offset_list;
04789     }
04790 
04791     pattern_length = 0;
04792     for (sip = anchorpattern->lengthrepeats;  sip != NULL;  sip = sip->next) {
04793         pattern_length += (sip->size_value * sip->num_appearances);
04794     }
04795     if (pattern_length == 0) {
04796         return offset_list;
04797     }
04798 
04799     prev_offset = NULL;
04800     next_offset = offset_list;
04801     line_counter = 0;
04802     sip = list;
04803     while (sip != NULL  &&  num_additional_offsets < max_additional_offsets) {
04804         /* if we are somehow out of synch, don't get caught in infinite loop */
04805         if (next_offset != NULL  &&  line_counter > next_offset->ival) {
04806             next_offset = next_offset->next;
04807         } else if (next_offset != NULL  &&  line_counter == next_offset->ival) {
04808             skipped_previous = eFalse;
04809             prev_offset = next_offset;
04810             next_offset = next_offset->next;
04811             /* advance sip and line counter past the end of this pattern */
04812             num_chars = 0;
04813             while (num_chars < pattern_length  &&  sip != NULL) {
04814                 num_chars += sip->size_value * sip->num_appearances;
04815                 line_counter += sip->num_appearances;
04816                 sip = sip->next;
04817             }
04818         } else if (skipped_previous) {
04819             line_skip = 0;
04820             while (sip != NULL  &&  line_skip < sip->num_appearances 
04821                   &&  num_additional_offsets < max_additional_offsets
04822                   &&  (next_offset == NULL
04823                        ||  line_counter < next_offset->ival)) {
04824                 /* see if we can build a pattern that matches the pattern 
04825                  * length we want
04826                  */
04827                 forecast_position = s_ForecastPattern (line_counter,
04828                                                      pattern_length,
04829                                                      next_offset, line_skip,
04830                                                      sip);
04831                 if (forecast_position > 0) {
04832                     new_offset = s_IntLinkNew (forecast_position, NULL);
04833                     num_additional_offsets++;
04834                     if (new_offset == NULL) {
04835                         return NULL;
04836                     }
04837                     if (prev_offset == NULL) {
04838                         new_offset->next = offset_list;
04839                         offset_list = new_offset;
04840                     } else {
04841                         new_offset->next = next_offset;
04842                         prev_offset->next = new_offset;
04843                     }
04844                     prev_offset = new_offset;
04845                     /* now advance sip and line counter past the end 
04846                      * of the pattern we have just created
04847                      */
04848                     num_chars = 0;
04849                     while (num_chars < pattern_length  &&  sip != NULL) {
04850                         for (line_skip = 0;
04851                              line_skip < sip->num_appearances
04852                                  &&  num_chars < pattern_length;
04853                              line_skip++)
04854                         {
04855                             num_chars += sip->size_value;
04856                             line_counter ++;
04857                         }
04858                         if (line_skip == sip->num_appearances) {
04859                             sip = sip->next;
04860                             line_skip = 0;
04861                         }
04862                     }
04863                 } else {
04864                     line_counter += sip->num_appearances;
04865                     sip = sip->next;
04866                     line_skip = 0;
04867                 }
04868             }
04869         } else {
04870             skipped_previous = eTrue;
04871             line_counter += sip->num_appearances;
04872             sip = sip->next;
04873         }
04874     }
04875     if (num_additional_offsets >= max_additional_offsets)
04876     {
04877       s_IntLinkFree (offset_list);
04878       offset_list = NULL;
04879     }
04880     return offset_list;
04881 }
04882 
04883 
04884 /* This function finds the most frequently occurring distance between
04885  * two sequence data blocks and returns that value.
04886  */
04887 static int  s_GetMostPopularPatternLength (TIntLinkPtr offset_list)
04888 {
04889     int          line_counter, best_length;
04890     TSizeInfoPtr pattern_length_list;
04891     TIntLinkPtr  offset;
04892 
04893     if (offset_list == NULL) {
04894         return -1;
04895     }
04896 
04897     line_counter = -1;
04898     pattern_length_list = NULL;
04899     for (offset = offset_list;  offset != NULL;  offset = offset->next) {
04900         if (line_counter != -1) {
04901             pattern_length_list = s_AddSizeInfo (pattern_length_list,
04902                                                offset->ival - line_counter);
04903         }
04904         line_counter = offset->ival;
04905     }
04906     best_length = s_GetMostPopularSize (pattern_length_list);
04907     s_SizeInfoFree (pattern_length_list);
04908     return best_length;
04909 }
04910 
04911 
04912 /* This function finds the most frequently appearing number of characters 
04913  * in a block of sequence data and returns that value.
04914  */
04915 static int 
04916 s_GetBestCharacterLength 
04917 (TLineInfoPtr token_list,
04918  TIntLinkPtr  offset_list,
04919  int          block_length)
04920 {
04921     TLineInfoPtr lip;
04922     TIntLinkPtr  prev_offset, new_offset;
04923     int          line_diff, num_chars, best_num_chars;
04924     TSizeInfoPtr pattern_length_list = NULL;
04925 
04926     if (token_list == NULL  ||  offset_list == NULL  ||  block_length < 1) {
04927         return -1;
04928     }
04929     /* get length of well-formatted block size */
04930     lip = token_list;
04931     prev_offset = NULL;
04932     for (new_offset = offset_list;
04933          new_offset != NULL  &&  lip != NULL;
04934          new_offset = new_offset->next)
04935     {
04936         if (prev_offset == NULL) {
04937             /* skip first tokens */
04938             for (line_diff = 0;
04939                  line_diff < new_offset->ival  &&  lip != NULL;
04940                  line_diff ++)
04941             {
04942                 lip = lip->next;
04943             }
04944         }
04945         if (prev_offset != NULL) {
04946             num_chars = 0;
04947             for (line_diff = 0;
04948                  line_diff < new_offset->ival - prev_offset->ival
04949                      &&  lip != NULL;
04950                  line_diff ++)
04951             {
04952                 if (line_diff < new_offset->ival - prev_offset->ival - 1) {
04953                     num_chars += strlen (lip->data);
04954                 }
04955                 lip = lip->next;
04956             }
04957             if (new_offset->ival - prev_offset->ival == block_length) {
04958                 pattern_length_list = s_AddSizeInfo (pattern_length_list,
04959                                                    num_chars);
04960             }
04961         }
04962         prev_offset = new_offset;
04963     }
04964     best_num_chars = s_GetMostPopularSize (pattern_length_list);
04965     if (best_num_chars == 0  &&  pattern_length_list != NULL) {
04966         best_num_chars = pattern_length_list->size_value;
04967     }
04968     s_SizeInfoFree (pattern_length_list);
04969     pattern_length_list = NULL;
04970     return best_num_chars;
04971 }
04972 
04973 
04974 static int  
04975 s_CountCharactersBetweenOffsets 
04976 (TLineInfoPtr list,
04977  int          distance,
04978  int          desired_num_chars)
04979 {
04980     int          line_diff, num_chars, total_chars, pattern_length, num_starts;
04981     TLineInfoPtr lip;
04982     TIntLinkPtr  length_list, start_list, start_ptr, length;
04983     int          start_of_unknown;
04984     int          num_additional_offsets_needed;
04985 
04986     if (list == NULL  ||  distance == 0  ||  desired_num_chars == 0) {
04987         return 0;
04988     }
04989 
04990     /* because the first offset is the start of a known pattern, we should
04991      * skip to the end of that pattern and start looking for additional
04992      * offsets
04993      */
04994     total_chars = 0;
04995     for (lip = list, line_diff = 0;
04996          lip != NULL  &&  line_diff < distance
04997              &&  total_chars < desired_num_chars;
04998          lip = lip->next, line_diff++) {
04999         num_chars = strlen (lip->data);
05000         total_chars += num_chars;
05001     }
05002     while (lip != NULL && line_diff < distance  &&  s_IsBlank (lip->data)) {
05003         lip = lip->next;
05004         line_diff ++;
05005     }
05006     /* skip over line we would need for ID */
05007     if (lip != NULL) {
05008         lip = lip->next;
05009         line_diff ++;
05010     }
05011   
05012     if (lip == NULL  ||  line_diff == distance) {
05013         return 0;
05014     }
05015     num_starts = 1;
05016     list = lip->next;
05017     start_of_unknown = line_diff;
05018 
05019     length_list = NULL;
05020     total_chars = 0;
05021     for (lip = list;
05022          lip != NULL  &&  line_diff < distance;
05023          lip = lip->next, line_diff++)
05024     {
05025         num_chars = strlen (lip->data);
05026         length = s_IntLinkNew (num_chars, length_list);
05027         if (length_list == NULL) {
05028             length_list = length;
05029         }
05030         total_chars += num_chars;
05031     }
05032 
05033     /* how many offsets do we need? */
05034     num_additional_offsets_needed = (total_chars / desired_num_chars);
05035     if (num_additional_offsets_needed == 0) {
05036         return 0;
05037     }
05038 
05039     /* Find all the places you could start and get the exact right number
05040      * of characters
05041      */
05042     start_list = NULL;
05043     num_starts = 0;
05044     pattern_length = 0;
05045     for (start_ptr = length_list, line_diff = start_of_unknown;
05046          start_ptr != NULL  &&  line_diff < distance
05047            &&  pattern_length < distance - line_diff ;
05048          start_ptr = start_ptr->next, line_diff++) {
05049         num_chars = start_ptr->ival;
05050         pattern_length = 1;
05051         length = start_ptr->next;
05052         while (num_chars < desired_num_chars
05053                &&  pattern_length + line_diff < distance
05054                &&  length != NULL)
05055         {
05056             num_chars += length->ival;
05057             pattern_length ++;
05058             length = length->next;
05059         }
05060         if (num_chars == desired_num_chars) {
05061             length = s_IntLinkNew (line_diff, start_list);
05062             if (start_list == NULL) {
05063                 start_list = length;
05064             }
05065             num_starts ++;
05066         }
05067     }
05068 
05069     /* now select best set of start points */
05070     
05071     s_IntLinkFree (length_list);
05072     s_IntLinkFree (start_list);
05073     return 0;
05074 }
05075 
05076 
05077 /* This function inserts new block locations into the offset_list
05078  * by looking for likely starts of abnormal patterns.
05079  */
05080 static void s_InsertNewOffsets
05081 (TLineInfoPtr token_list,
05082  TIntLinkPtr  offset_list,
05083  int          block_length,
05084  int          best_num_chars,
05085  char *       alphabet)
05086 {
05087     TLineInfoPtr lip, prev_start;
05088     TIntLinkPtr  prev_offset, new_offset, splice_offset;
05089     int          line_diff, num_chars, line_start;
05090 
05091     if (token_list == NULL  ||  offset_list == NULL
05092         ||  block_length < 1  ||  best_num_chars < 1)
05093     {
05094         return;
05095     }
05096 
05097     lip = token_list;
05098     prev_offset = NULL;
05099     for (new_offset = offset_list;
05100          new_offset != NULL  &&  lip != NULL;
05101          new_offset = new_offset->next) {
05102         if (prev_offset == NULL) {
05103             /* just advance through tokens */
05104             for (line_diff = 0;
05105                  line_diff < new_offset->ival  &&  lip != NULL;
05106                  line_diff ++) {
05107                 lip = lip->next;
05108             }
05109         } else {
05110             if (new_offset->ival - prev_offset->ival == block_length) {
05111                 /* just advance through tokens */
05112                 for (line_diff = 0;
05113                      line_diff < new_offset->ival - prev_offset->ival 
05114                          &&  lip != NULL;
05115                      line_diff ++) {
05116                     lip = lip->next;
05117                 }
05118             } else {
05119                 /* look for intermediate breaks */
05120                 prev_start = lip;
05121                 num_chars = 0;
05122                 for (line_diff = 0;
05123                      line_diff < new_offset->ival - prev_offset->ival
05124                          &&  lip != NULL  &&  num_chars < best_num_chars;
05125                      line_diff ++) {
05126                     num_chars += strlen (lip->data);
05127                     lip = lip->next;
05128                 }
05129                 if (lip == NULL) {
05130                   return;
05131                 }
05132                 /* set new offset at first line of next pattern */
05133                 line_diff ++;
05134                 lip = lip->next;
05135                 if (line_diff < new_offset->ival - prev_offset->ival) {
05136                     line_start = line_diff + prev_offset->ival;
05137                     /* advance token pointer to new piece */
05138                     while (line_diff < new_offset->ival - prev_offset->ival
05139                            &&  lip != NULL)
05140                     {
05141                         lip = lip->next;
05142                         line_diff ++;
05143                     }
05144                     /* insert new offset value */
05145                     splice_offset = s_IntLinkNew (line_start, NULL);
05146                     if (splice_offset == NULL) {
05147                         return;
05148                     }
05149                     splice_offset->next = new_offset;
05150                     prev_offset->next = splice_offset;
05151 
05152                     s_CountCharactersBetweenOffsets (lip,
05153                                        new_offset->ival - splice_offset->ival,
05154                                        best_num_chars);
05155                 }
05156             }
05157         }
05158         prev_offset = new_offset;
05159     }
05160     
05161     /* iterate through the last block */
05162     for (line_diff = 0;
05163          line_diff < block_length && lip != NULL; 
05164          line_diff ++) {
05165         lip = lip->next;
05166     }
05167 
05168     /* if we have room for one more sequence, or even most of one more sequence, add it */
05169     if (lip != NULL  &&  ! s_SkippableString (lip->data)) {
05170         splice_offset = s_IntLinkNew (line_diff + prev_offset->ival, prev_offset);
05171     }
05172 }
05173 
05174 
05175 /* This function returns true if the string contains digits, false otherwise */
05176 static EBool s_ContainsDigits (char *data)
05177 {
05178     char *cp;
05179 
05180     if (data == NULL) return eFalse;
05181     for (cp = data; *cp != 0; cp++) {
05182         if (isdigit ((unsigned char)(*cp))) {
05183             return eTrue;
05184         }
05185     }
05186     return eFalse;
05187 }
05188 
05189 
05190 /* This function processes the alignment file data by dividing the original
05191  * lines into pieces based on whitespace and looking for patterns of length 
05192  * in the data. 
05193  */
05194 static void s_ProcessAlignFileRawByLengthPattern (SAlignRawFilePtr afrp)
05195 {
05196     TLineInfoPtr   token_list;
05197     SLengthListPtr list;
05198     TLineInfoPtr   lip;
05199     SLengthListPtr anchorpattern[2];
05200     TIntLinkPtr    offset_list;
05201     int            best_length;
05202     int            best_num_chars;
05203 
05204     if (afrp == NULL  ||  afrp->line_list == NULL) {
05205         return;
05206     }
05207 
05208     token_list = s_BuildTokenList (afrp->line_list);
05209     token_list = s_RemoveCommentsFromTokens (token_list);
05210     token_list = s_RemoveNexusCommentsFromTokens (token_list);
05211 
05212     list = s_LengthListNew ( NULL );
05213     for (lip = token_list;
05214          lip != NULL  &&  ! s_FoundStopLine (lip->data);
05215          lip = lip->next)
05216     {
05217         if (s_SkippableString (lip->data)  ||  s_ContainsDigits(lip->data)) {
05218             s_AddLengthRepeat (list, 0);
05219         } else {
05220             s_AddLengthRepeat (list, strlen (lip->data));
05221         }
05222     }
05223 
05224     anchorpattern [0] = s_FindMostPopularPattern (list->lengthrepeats);
05225     anchorpattern [1] = NULL;
05226     if (anchorpattern [0] == NULL  ||  anchorpattern[0]->lengthrepeats == NULL) {
05227         return;
05228     }
05229 
05230     /* find anchor patterns in original list, 
05231      * find distances between anchor patterns 
05232      */
05233     offset_list = s_CreateOffsetList (list->lengthrepeats, anchorpattern[0]);
05234     offset_list = s_AugmentOffsetList (offset_list,
05235                                      list->lengthrepeats,
05236                                      anchorpattern[0]);
05237 
05238     /* resolve unusual distances between anchor patterns */
05239     best_length = s_GetMostPopularPatternLength (offset_list);
05240     if (best_length < 1  &&  offset_list != NULL  && offset_list->next != NULL) {
05241         best_length = offset_list->next->ival - offset_list->ival;
05242     }
05243     best_num_chars = s_GetBestCharacterLength (token_list, offset_list,
05244                                              best_length);
05245     s_InsertNewOffsets (token_list, offset_list, best_length, best_num_chars,
05246                       afrp->alphabet);
05247 
05248     /* use token before each anchor pattern as ID, use tokens for distance
05249      * between anchor patterns for sequence data
05250      */
05251     s_CreateSequencesBasedOnTokenPatterns (token_list, offset_list,
05252                                        anchorpattern, afrp);
05253   
05254     s_LengthListFree (anchorpattern[0]);
05255     s_LengthListFree (list);
05256     s_LineInfoFree (token_list);
05257 }
05258 
05259 
05260 /* The following functions are used to convert data from the internal
05261  * representation into the form that will be passed to the calling
05262  * program.  Information from the ID strings is parsed to remove
05263  * definition lines and organism information, the gap characters are
05264  * standardized to '-', the missing characters are standardizes to 'N',
05265  * match characters are replaced with characters from the first record,
05266  * and bad characters are reported.
05267  */
05268 
05269 /* This function allocates memory for a new AligmentFileData structure
05270  * and initializes its member variables.
05271  */
05272 extern TAlignmentFilePtr AlignmentFileNew (void)
05273 {
05274     TAlignmentFilePtr afp;
05275 
05276     afp = (TAlignmentFilePtr) malloc (sizeof (SAlignmentFile));
05277     if (afp == NULL) {
05278         return NULL;
05279     }
05280     afp->num_sequences = 0;
05281     afp->num_organisms = 0;
05282     afp->num_deflines  = 0;
05283     afp->num_segments  = 0;
05284     afp->ids           = NULL;
05285     afp->sequences     = NULL;
05286     afp->organisms     = NULL;
05287     afp->deflines      = NULL;
05288     return afp;
05289 }
05290 
05291 
05292 /* This function frees the memory associated with an AligmentFileData
05293  * structure and its member variables.
05294  */
05295 extern void AlignmentFileFree (TAlignmentFilePtr afp)
05296 {
05297     int  index;
05298 
05299     if (afp == NULL) {
05300         return;
05301     }
05302     if (afp->ids != NULL) {
05303         for (index = 0;  index < afp->num_sequences;  index++) {
05304             free (afp->ids [index]);
05305         }  
05306         free (afp->ids);
05307         afp->ids = NULL;
05308     }
05309     if (afp->sequences != NULL) {
05310         for (index = 0;  index < afp->num_sequences;  index++) {
05311             free (afp->sequences [index]);
05312         }  
05313         free (afp->sequences);
05314         afp->sequences = NULL;
05315     }
05316     if (afp->organisms != NULL) {
05317         for (index = 0;  index < afp->num_organisms;  index++) {
05318             free (afp->organisms [index]);
05319         }  
05320         free (afp->organisms);
05321         afp->sequences = NULL;
05322     }
05323     if (afp->deflines != NULL) {
05324         for (index = 0;  index < afp->num_deflines;  index++) {
05325             free (afp->deflines [index]);
05326         }  
05327         free (afp->deflines);
05328         afp->deflines = NULL;
05329     }
05330     free (afp);
05331 }
05332 
05333 
05334 /* This function parses the identifier string used by the alignment file
05335  * to identify a sequence to find the portion of the string that is actually
05336  * an ID, as opposed to organism information or definition line.
05337  */
05338 static char * s_GetIdFromString (char * str)
05339 {
05340     char * cp;
05341     char * id;
05342     int    len;
05343 
05344     if (str == NULL) {
05345         return NULL;
05346     }
05347 
05348     cp = str;
05349     cp += strspn (str, " >\t");
05350     len = strcspn (cp, " \t\r\n");
05351     if (len == 0) {
05352         return NULL;
05353     }
05354     id = (char *)malloc (len + 1);
05355     if (id == NULL) {
05356         return NULL;
05357     }
05358     strncpy (id, cp, len);
05359     id [ len ] = 0;
05360     return id;
05361 }
05362 
05363 
05364 /* This function pulls defline information from the ID string, if there is
05365  * any.
05366  */
05367 static char * s_GetDeflineFromIdString (char * str)
05368 {
05369     char * cp;
05370     int    len;
05371 
05372     if (str == NULL) {
05373         return NULL;
05374     }
05375 
05376     cp = str;
05377     cp += strspn (str, " >\t");
05378     len = strcspn (cp, " \t\r\n");
05379     if (len == 0) {
05380         return NULL; 
05381     }
05382     cp += len;
05383     len = strspn (cp, " \t\r\n");
05384     if (len == 0) {
05385         return NULL; 
05386     }
05387     cp += len;
05388     if (*cp == 0) {
05389         return NULL;
05390     }
05391     return strdup (cp);
05392 }
05393 
05394 
05395 /* This function takes the ID strings read from the file and parses them
05396  * to obtain a defline (if there is extra text after the ID and/or
05397  * organism information) and to obtain the actual ID for the sequence.
05398  */
05399 static EBool s_ReprocessIds (SAlignRawFilePtr afrp)
05400 {
05401     TStringCountPtr list, scp;
05402     TAlignRawSeqPtr arsp;
05403     TLineInfoPtr    lip;
05404     char *          id;
05405     int             line_num;
05406     EBool           rval = eTrue;
05407     char *          defline;
05408 
05409     if (afrp == NULL) {
05410         return eFalse;
05411     }
05412 
05413     list = NULL;
05414     lip = afrp->deflines;
05415     for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
05416         if (arsp->id_lines != NULL) {
05417             line_num = arsp->id_lines->ival;
05418         } else {
05419             line_num = -1;
05420         }
05421         s_RemoveOrganismCommentFromLine (arsp->id);
05422         id = s_GetIdFromString (arsp->id);
05423         if (lip == NULL) {
05424             defline = s_GetDeflineFromIdString (arsp->id);
05425             afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
05426                                             line_num, 0);
05427             free (defline);
05428             afrp->num_deflines ++;
05429         }
05430         free (arsp->id);
05431         arsp->id = id;
05432         list = s_AddStringCount (arsp->id, line_num, list);
05433     }
05434 
05435     for (scp = list;  scp != NULL;  scp = scp->next) {
05436         if (scp->num_appearances > 1) {
05437             rval = eFalse;
05438             s_ReportRepeatedId (scp, afrp->report_error,
05439                               afrp->report_error_userdata);
05440         }
05441     }
05442     /* free string count list */
05443     s_StringCountFree (list);
05444     return rval;
05445 }
05446 
05447 
05448 /* This function reports unacceptable characters in a sequence.  Frequently
05449  * there will be more than one character of the same kind (for instance,
05450  * when the user has incorrectly specified a gap character), so repeated
05451  * characters are reported together.  The function advances the data 
05452  * position in the SLineInfoReader structure lirp, and returns the
05453  * current data position for lirp.
05454  */
05455 static int 
05456 s_ReportRepeatedBadCharsInSequence
05457 (TLineInfoReaderPtr   lirp,
05458  char *               id,
05459  char *               reason,
05460  FReportErrorFunction report_error,
05461  void *              report_error_userdata)
05462 {
05463     int  bad_line_num, bad_line_offset;
05464     int  num_bad_chars;
05465     char bad_char, curr_char;
05466     int  data_position;
05467 
05468     bad_line_num = s_LineInfoReaderGetCurrentLineNumber (lirp);
05469     bad_line_offset = s_LineInfoReaderGetCurrentLineOffset (lirp);
05470     bad_char = *lirp->curr_line_pos;
05471     num_bad_chars = 1;
05472     data_position = lirp->data_pos + 1;
05473     while ((curr_char = s_FindNthDataChar (lirp, data_position)) == bad_char) {
05474         num_bad_chars ++;
05475         data_position ++;
05476     }
05477     s_ReportBadCharError (id, bad_char, num_bad_chars,
05478                         bad_line_offset, bad_line_num, reason,
05479                         report_error, report_error_userdata);
05480     return data_position;
05481 }
05482 
05483 
05484 /* This function does context-sensitive replacement of the missing,
05485  * match, and gap characters and also identifies bad characters.
05486  * Gap characters found in the wrong location in the sequence are
05487  * considered an error.  Characters that are not missing, match, or 
05488  * gap characters and are not in the specified sequence alphabet are
05489  * reported as errors.  Match characters in the first sequence are also
05490  * reported as errors.
05491  * The function will return eTrue if any errors were found, or eFalse
05492  * if there were no errors.
05493  */
05494 static EBool
05495 s_FindBadDataCharsInSequence
05496 (TAlignRawSeqPtr      arsp,
05497  TAlignRawSeqPtr      master_arsp,
05498  TSequenceInfoPtr     sip,
05499  int                  num_segments,
05500  FReportErrorFunction report_error,
05501  void *               report_error_userdata)
05502 {
05503     TLineInfoReaderPtr lirp, master_lirp;
05504     int                data_position;
05505     int                middle_start = 0;
05506     int                middle_end = 0;
05507     char               curr_char, master_char;
05508     EBool              found_middle_start;
05509     EBool              rval = eFalse;
05510     EBool              match_not_in_beginning_gap;
05511     EBool              match_not_in_end_gap;
05512 
05513     if (arsp == NULL  ||  master_arsp == NULL  ||  sip == NULL) {
05514         return eTrue;
05515     }
05516     lirp = s_LineInfoReaderNew (arsp->sequence_data);
05517     if (lirp == NULL) {
05518         return eTrue;
05519     }
05520     if (arsp != master_arsp) {
05521         master_lirp = s_LineInfoReaderNew (master_arsp->sequence_data);
05522         if (master_lirp == NULL) {
05523             s_LineInfoReaderFree (lirp);
05524             return eTrue;
05525         }
05526     } else {
05527         master_lirp = NULL;
05528     }
05529 
05530     if (strcspn (sip->beginning_gap, sip->match) 
05531                   == strlen (sip->beginning_gap)) {
05532         match_not_in_beginning_gap = eTrue;
05533     } else {
05534         match_not_in_beginning_gap = eFalse;
05535     }
05536 
05537     if (strcspn (sip->end_gap, sip->match) == strlen (sip->end_gap)) {
05538         match_not_in_end_gap = eTrue;
05539     } else {
05540         match_not_in_end_gap = eFalse;
05541     }
05542 
05543     /* First, find middle start and end positions and report characters
05544      * that are not beginning gap before the middle
05545      */
05546     found_middle_start = eFalse;
05547     data_position = 0;
05548     curr_char = s_FindNthDataChar (lirp, data_position);
05549     while (curr_char != 0) {
05550         if (strchr (sip->alphabet, curr_char) != NULL) {
05551             if (! found_middle_start) {
05552                 middle_start = data_position;
05553                 found_middle_start = eTrue;
05554             }
05555             middle_end = data_position + 1;
05556             data_position ++;
05557         } else if (! found_middle_start) {
05558             if (match_not_in_beginning_gap
05559                 &&  strchr (sip->match, curr_char) != NULL)
05560             {
05561                 middle_start = data_position;
05562                 found_middle_start = eTrue;
05563                 middle_end = data_position + 1;
05564                 data_position ++;
05565             } else if (strchr (sip->beginning_gap, curr_char) == NULL) {
05566                 /* Report error - found character that is not beginning gap
05567                    in beginning gap */
05568                 data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05569                                                                   arsp->id,
05570                                 "expect only beginning gap characters here",
05571                                 report_error, report_error_userdata);
05572                 rval = eTrue;
05573             } else {
05574                 *lirp->curr_line_pos = '-';
05575                 data_position ++;
05576             }
05577         } else {
05578             if (match_not_in_end_gap
05579                 &&  strchr (sip->match, curr_char) != NULL)
05580             {
05581                 middle_end = data_position + 1;
05582             }
05583             data_position ++;
05584         }
05585         curr_char = s_FindNthDataChar (lirp, data_position);
05586     }
05587 
05588     if (! found_middle_start) {
05589         if (num_segments > 1)
05590         {
05591             return eFalse;
05592         }
05593         else
05594         {
05595             s_ReportMissingSequenceData (arsp->id,
05596                                    report_error, report_error_userdata);
05597             s_LineInfoReaderFree (lirp);
05598             return eTrue;
05599           
05600         }
05601     }
05602 
05603     /* Now complain about bad middle characters */
05604     data_position = middle_start;
05605     while (data_position < middle_end)
05606     {
05607         curr_char = s_FindNthDataChar (lirp, data_position);
05608         while (data_position < middle_end
05609                &&  strchr (sip->alphabet, curr_char) != NULL) {
05610             data_position ++;
05611             curr_char = s_FindNthDataChar (lirp, data_position);
05612         } 
05613         if (curr_char == 0  ||  data_position >= middle_end) {
05614             /* do nothing, done with middle */
05615         } else if (strchr (sip->missing, curr_char) != NULL) {
05616             *lirp->curr_line_pos = 'N';
05617             data_position ++;
05618         } else if (strchr (sip->match, curr_char) != NULL) {
05619             master_char = s_FindNthDataChar (master_lirp, data_position);
05620             if (master_char == 0) {
05621                 /* report error - unable to get master char */
05622                 if (master_arsp == arsp) {
05623                     data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05624                                 arsp->id,
05625                                 "can't specify match chars in first sequence",
05626                                 report_error, report_error_userdata);
05627                 } else {
05628                     data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05629                                 arsp->id,
05630                                 "can't find source for match chars",
05631                                 report_error, report_error_userdata);
05632                 }
05633                 rval = eTrue;
05634             } else {
05635                 *lirp->curr_line_pos = master_char;
05636                 data_position ++;
05637             }
05638         } else if (strchr (sip->middle_gap, curr_char) != NULL) {
05639             *lirp->curr_line_pos = '-';
05640             data_position ++;
05641         } else {
05642             /* Report error - found bad character in middle */
05643             data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05644                                       arsp->id,
05645                                       "expect only sequence, missing, match,"
05646                                       " and middle gap characters here",
05647                                       report_error, report_error_userdata);
05648             rval = eTrue;
05649         }
05650     }
05651 
05652     /* Now find and complain about end characters */
05653     data_position = middle_end;
05654     curr_char = s_FindNthDataChar (lirp, data_position);
05655     while (curr_char != 0) {
05656         if (strchr (sip->end_gap, curr_char) == NULL) {
05657             /* Report error - found bad character in middle */
05658             data_position = s_ReportRepeatedBadCharsInSequence (lirp, arsp->id,
05659                                       "expect only end gap characters here",
05660                                       report_error, report_error_userdata);
05661             rval = eTrue;
05662         } else {
05663             *lirp->curr_line_pos = '-';
05664             data_position++;
05665         }
05666         curr_char = s_FindNthDataChar (lirp, data_position);
05667     }
05668     s_LineInfoReaderFree (lirp);
05669     s_LineInfoReaderFree (master_lirp);
05670     return rval;
05671 }
05672 
05673 
05674 /* This function examines each sequence and replaces the special characters
05675  * and reports bad characters in each one.  The function will return eTrue
05676  * if any of the sequences contained bad characters or eFalse if no errors
05677  * were seen.
05678  */
05679 static EBool
05680 s_s_FindBadDataCharsInSequenceList
05681 (SAlignRawFilePtr afrp,
05682  TSequenceInfoPtr sip)
05683 {
05684     TAlignRawSeqPtr arsp;
05685     EBool           rval = eFalse;
05686 
05687     if (afrp == NULL  ||  afrp->sequences == NULL) {
05688         return eTrue;
05689     }
05690     for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
05691         if (s_FindBadDataCharsInSequence (arsp, afrp->sequences, sip,
05692                                         afrp->num_segments,
05693                                         afrp->report_error,
05694                                         afrp->report_error_userdata)) {
05695             rval = eTrue;
05696         }
05697     }
05698     return rval;
05699 }
05700 
05701 
05702 /* This function examines the organisms listed for the alignment and determines
05703  * whether any of the organism names (including the associated comments) are
05704  * repeated.
05705  */
05706 static EBool s_AreOrganismsUnique (SAlignRawFilePtr afrp)
05707 {
05708     TLineInfoPtr    this_org, lip;
05709     TAlignRawSeqPtr arsp;
05710     EBool           are_unique;
05711 
05712     if (afrp == NULL  ||  afrp->num_organisms == 0
05713         ||  afrp->organisms == NULL) {
05714         return eFalse;
05715     }
05716     are_unique = eTrue;
05717     for (this_org = afrp->organisms;
05718          this_org != NULL;
05719          this_org = this_org->next) {
05720         lip = afrp->organisms;
05721         arsp = afrp->sequences;
05722         while (lip != NULL  &&  lip != this_org
05723                &&  strcmp (lip->data, this_org->data) != 0  &&  arsp != NULL) {
05724             lip = lip->next;
05725             arsp = arsp->next;
05726         }
05727         if (lip != NULL  &&  lip != this_org) {
05728             are_unique = eFalse;
05729             s_ReportRepeatedOrganismName (arsp->id, this_org->line_num,
05730                                         lip->line_num,
05731                                         this_org->data,
05732                                         afrp->report_error,
05733                                         afrp->report_error_userdata);
05734         }
05735     }
05736     return are_unique;
05737 }
05738 
05739 
05740 #if 0 /* this step was removed by indexer request */
05741 /* This function reports whether the definition lines are identical for
05742  * each sequence or not.
05743  */
05744 static EBool s_AreDeflinesIdentical (SAlignRawFilePtr afrp)
05745 {
05746     TLineInfoPtr    lip;
05747     TStringCountPtr list;
05748     EBool           rval;
05749 
05750     if (afrp == NULL) {
05751         return eFalse;
05752     }
05753 
05754     list = NULL;
05755     for (lip = afrp->deflines;  lip != NULL;  lip = lip->next) {
05756         list = s_AddStringCount (lip->data, lip->line_num, list);
05757     }
05758     rval = eTrue;
05759     if (list != NULL  &&  list->next != NULL) {
05760         rval = eFalse; 
05761         s_ReportDefinitionLineMismatch (afrp->report_error,
05762                                       afrp->report_error_userdata);
05763         s_ReportDefinitionLines (list, afrp->report_error,
05764                                afrp->report_error_userdata);
05765     }
05766     s_StringCountFree (list);
05767     return rval;
05768 }
05769 #endif
05770 
05771 
05772 /* This function uses the contents of an SAlignRawFileData structure to
05773  * create an SAlignmentFile structure with the appropriate information.
05774  */
05775 static TAlignmentFilePtr
05776 s_ConvertDataToOutput 
05777 (SAlignRawFilePtr afrp,
05778  TSequenceInfoPtr sip)
05779 {
05780     TAlignRawSeqPtr   arsp;
05781     int               index;
05782     TSizeInfoPtr    * lengths;
05783     int             * best_length;
05784     TAlignmentFilePtr afp;
05785     TLineInfoPtr      lip;
05786     int               curr_seg;
05787 
05788     if (afrp == NULL  ||  sip == NULL  ||  afrp->sequences == NULL) {
05789         return NULL;
05790     }
05791     afp = AlignmentFileNew ();
05792     if (afp == NULL) {
05793         return NULL;
05794     }
05795 
05796     afp->num_organisms = afrp->num_organisms;
05797     afp->num_deflines = afrp->num_deflines;
05798     afp->num_segments = afrp->num_segments;
05799     afp->num_sequences = 0;
05800     afp->align_format_found = afrp->align_format_found;
05801     lengths = NULL;
05802 
05803     for (arsp = afrp->sequences;  arsp != NULL;  arsp = arsp->next) {
05804         afp->num_sequences++;
05805     }
05806 
05807     if (afp->num_sequences != afrp->num_organisms
05808         && afp->num_sequences / afp->num_segments != afrp->num_organisms) {
05809         s_ReportMissingOrganismInfo (afrp->report_error,
05810                                    afrp->report_error_userdata);
05811     } else {
05812         s_AreOrganismsUnique (afrp);
05813     }
05814 
05815     afp->sequences = (char **)malloc (afp->num_sequences 
05816                                            * sizeof (char *));
05817     if (afp->sequences == NULL) {
05818         AlignmentFileFree (afp);
05819         return NULL;
05820     }
05821     afp->ids = (char **)malloc (afp->num_sequences * sizeof (char *));
05822     if (afp->ids == NULL) {
05823         AlignmentFileFree (afp);
05824         return NULL;
05825     }
05826     if (afp->num_organisms > 0) {
05827         afp->organisms = (char **) malloc (afp->num_organisms
05828                                                 * sizeof (char *));
05829         if (afp->organisms == NULL) {
05830             AlignmentFileFree (afp);
05831             return NULL;
05832         }
05833     }
05834     if (afp->num_deflines > 0) {
05835         afp->deflines = (char **)malloc (afp->num_deflines
05836                                               * sizeof (char *));
05837         if (afp->deflines == NULL) {
05838             AlignmentFileFree (afp);
05839             return NULL;
05840         }
05841     }
05842 
05843     /* copy in deflines */
05844     for (lip = afrp->deflines, index = 0;
05845          lip != NULL  &&  index < afp->num_deflines;
05846          lip = lip->next, index++) {
05847         if (lip->data == NULL) {
05848             afp->deflines [index] = NULL;
05849         } else {
05850             afp->deflines [index] = strdup (lip->data);
05851         }
05852     }
05853     while (index < afp->num_deflines) {
05854         afp->deflines [index ++] = NULL;
05855     }
05856 
05857     /* copy in organism information */
05858     for (lip = afrp->organisms, index = 0;
05859          lip != NULL  &&  index < afp->num_organisms;
05860          lip = lip->next, index++) {
05861         afp->organisms [index] = strdup (lip->data);
05862     }
05863   
05864     /* we need to store length information about different segments separately */
05865     lengths = (TSizeInfoPtr *) malloc (sizeof (TSizeInfoPtr) * afrp->num_segments);
05866     if (lengths == NULL) {
05867         AlignmentFileFree (afp);
05868         return NULL;
05869     }
05870     best_length = (int *) malloc (sizeof (int) * afrp->num_segments);
05871     if (best_length == NULL) {
05872         free (lengths);
05873         AlignmentFileFree (afp);
05874         return NULL;
05875     }
05876     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++) {
05877         lengths [curr_seg] = NULL;
05878         best_length [curr_seg] = 0;
05879     }
05880     
05881     /* copy in sequence data */
05882     curr_seg = 0;
05883     for (arsp = afrp->sequences, index = 0;
05884          arsp != NULL  &&  index < afp->num_sequences;
05885          arsp = arsp->next, index++) {
05886         afp->sequences [index] = 
05887                     s_LineInfoMergeAndStripSpaces (arsp->sequence_data);
05888 
05889         if (afp->sequences [index] != NULL) {
05890             lengths [curr_seg] = s_AddSizeInfo (lengths [curr_seg], strlen (afp->sequences [index]));
05891         }
05892         afp->ids [index] = strdup (arsp->id);
05893         curr_seg ++;
05894         if (curr_seg >= afrp->num_segments) {
05895             curr_seg = 0;
05896         }
05897     }
05898     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
05899     {
05900         best_length [curr_seg] = s_GetMostPopularSize (lengths [curr_seg]);
05901         if (best_length [curr_seg] == 0  &&  lengths [curr_seg] != NULL) {
05902             best_length [curr_seg] = lengths [curr_seg]->size_value;
05903         }   
05904     }
05905 
05906     curr_seg = 0;
05907     for (index = 0;  index < afp->num_sequences;  index++) {
05908         if (afp->sequences [index] == NULL) {
05909             s_ReportMissingSequenceData (afp->ids [index],
05910                                        afrp->report_error,
05911                                        afrp->report_error_userdata);
05912         } else if ((int) strlen (afp->sequences [index]) != best_length [curr_seg]) {
05913             s_ReportBadSequenceLength (afp->ids [index], best_length [curr_seg],
05914                                      strlen (afp->sequences [index]),
05915                                      afrp->report_error,
05916                                      afrp->report_error_userdata);
05917         }
05918         curr_seg ++;
05919         if (curr_seg >= afrp->num_segments) {
05920             curr_seg = 0;
05921         }
05922     }
05923 
05924     if (afrp->expected_num_sequence > 0
05925       &&  afrp->expected_num_sequence != afp->num_sequences)
05926     {
05927         s_ReportIncorrectNumberOfSequences (afrp->expected_num_sequence,
05928                                           afp->num_sequences,
05929                                           afrp->report_error,
05930                                           afrp->report_error_userdata);
05931     }
05932     if (afrp->expected_sequence_len > 0
05933       &&  afrp->expected_sequence_len != best_length [0])
05934     {
05935         s_ReportIncorrectSequenceLength (afrp->expected_sequence_len,
05936                                        best_length [0],
05937                                        afrp->report_error,
05938                                        afrp->report_error_userdata);
05939     }
05940     
05941     free (best_length);
05942     for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
05943     {
05944         s_SizeInfoFree (lengths [curr_seg]);
05945     }
05946     free (lengths);
05947     
05948     return afp;
05949 }
05950 
05951 
05952 /* This is the function called by the calling program to read an alignment
05953  * file.  The readfunc argument is a function pointer supplied by the 
05954  * calling program which this library will use to read in data from the
05955  * file one line at a time.  The fileuserdata argument is a pointer to 
05956  * data used by the calling program's readfunc function and will be passed
05957  * back with each call to readfunc.
05958  * The errfunc argument is a function pointer supplied by the calling
05959  * program for reporting errors.  The erroruserdata argument is a pointer
05960  * to data used by the calling program's errfunc function and will be
05961  * passed back with each call to readfunc.
05962  * The sequence_info argument contains the sequence alphabet and missing,
05963  * match, and gap characters to use in interpreting the sequence data.
05964  */
05965 extern TAlignmentFilePtr 
05966 ReadAlignmentFileEx 
05967 (FReadLineFunction readfunc,
05968  void * fileuserdata,
05969  FReportErrorFunction errfunc,
05970  void * erroruserdata,
05971  TSequenceInfoPtr sequence_info,
05972  int              use_nexus_file_info)
05973 {
05974     SAlignRawFilePtr afrp;
05975     TAlignmentFilePtr afp;
05976     EBool             use_file = eFalse;
05977 
05978     if (sequence_info == NULL  ||  sequence_info->alphabet == NULL) {
05979         return NULL;
05980     }
05981     
05982     if (use_nexus_file_info != 0)
05983     {
05984       use_file = eTrue;
05985     }
05986     
05987     afrp = s_ReadAlignFileRaw ( readfunc, fileuserdata, sequence_info,
05988                                 use_file,
05989                                 errfunc, erroruserdata);
05990     if (afrp == NULL) {
05991         return NULL;
05992     }
05993 
05994     if (afrp->block_size > 1) {
05995         s_ProcessAlignRawFileByBlockOffsets (afrp);
05996     } else if (afrp->marked_ids) {
05997         s_ProcessAlignFileRawForMarkedIDs (afrp);
05998     } else {
05999         s_ProcessAlignFileRawByLengthPattern (afrp);
06000     }
06001 
06002     s_ReprocessIds (afrp);
06003 
06004 #if 0 /* this step was removed by indexer request */
06005     /* Note - have to check deflines after reprocessing IDs */
06006     s_AreDeflinesIdentical (afrp);
06007 #endif
06008 
06009     if (s_s_FindBadDataCharsInSequenceList (afrp, sequence_info)) {
06010         s_AlignFileRawFree (afrp);
06011         return NULL;
06012     }
06013 
06014     afp = s_ConvertDataToOutput (afrp, sequence_info);
06015     s_AlignFileRawFree (afrp);
06016   
06017     return afp;
06018 }
06019 
06020 
06021 extern TAlignmentFilePtr 
06022 ReadAlignmentFile 
06023 (FReadLineFunction readfunc,
06024  void * fileuserdata,
06025  FReportErrorFunction errfunc,
06026  void * erroruserdata,
06027  TSequenceInfoPtr sequence_info)
06028 {
06029     return ReadAlignmentFileEx (readfunc, fileuserdata, errfunc, erroruserdata,
06030                                 sequence_info, eFalse);
06031 }
06032 
06033 

Generated on Wed Dec 9 05:26:07 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Dec 09 08:18:15 2009 by modify_doxy.py rev. 173732