00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include <util/creaders/alnread.h>
00033 #include <stdio.h>
00034 #include <stdlib.h>
00035 #include <string.h>
00036 #include <ctype.h>
00037
00038 static const int kMaxPrintedIntLen = 10;
00039 #define MAX_PRINTED_INT_LEN_PLUS_ONE 11
00040
00041 typedef enum {
00042 eTrue = -1,
00043 eFalse = 0
00044 } EBool;
00045
00046
00047 typedef struct SLineInfo {
00048 char * data;
00049 int line_num;
00050 int line_offset;
00051 EBool delete_me;
00052 struct SLineInfo * next;
00053 } SLineInfo, * TLineInfoPtr;
00054
00055 typedef struct SLineInfoReader {
00056 TLineInfoPtr first_line;
00057 TLineInfoPtr curr_line;
00058 char * curr_line_pos;
00059 int data_pos;
00060 } SLineInfoReader, * TLineInfoReaderPtr;
00061
00062 typedef struct SIntLink {
00063 int ival;
00064 struct SIntLink * next;
00065 } SIntLink, * TIntLinkPtr;
00066
00067 typedef struct SStringCount {
00068 char * string;
00069 int num_appearances;
00070 TIntLinkPtr line_numbers;
00071 struct SStringCount * next;
00072 } SStringCount, * TStringCountPtr;
00073
00074 typedef struct SSizeInfo {
00075 int size_value;
00076 int num_appearances;
00077 struct SSizeInfo * next;
00078 } SSizeInfo, * TSizeInfoPtr;
00079
00080 typedef struct SLengthList {
00081 TSizeInfoPtr lengthrepeats;
00082 int num_appearances;
00083 struct SLengthList * next;
00084 } SLengthListData, * SLengthListPtr;
00085
00086 typedef struct SCommentLoc {
00087 char * start;
00088 char * end;
00089 struct SCommentLoc * next;
00090 } SCommentLoc, * TCommentLocPtr;
00091
00092 typedef struct SBracketedCommentList
00093 {
00094 TLineInfoPtr comment_lines;
00095 struct SBracketedCommentList * next;
00096 } SBracketedCommentList, * TBracketedCommentListPtr;
00097
00098 typedef struct SAlignRawSeq {
00099 char * id;
00100 TLineInfoPtr sequence_data;
00101 TIntLinkPtr id_lines;
00102 struct SAlignRawSeq * next;
00103 } SAlignRawSeq, * TAlignRawSeqPtr;
00104
00105 typedef struct SAlignFileRaw {
00106 TLineInfoPtr line_list;
00107 TLineInfoPtr organisms;
00108 TAlignRawSeqPtr sequences;
00109 int num_organisms;
00110 TLineInfoPtr deflines;
00111 int num_deflines;
00112 EBool marked_ids;
00113 int block_size;
00114 TIntLinkPtr offset_list;
00115 FReportErrorFunction report_error;
00116 void * report_error_userdata;
00117 char * alphabet;
00118 int expected_num_sequence;
00119 int expected_sequence_len;
00120 int num_segments;
00121 char align_format_found;
00122 } SAlignRawFileData, * SAlignRawFilePtr;
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133 extern TErrorInfoPtr ErrorInfoNew (TErrorInfoPtr list)
00134 {
00135 TErrorInfoPtr eip, last;
00136
00137 eip = (TErrorInfoPtr) malloc ( sizeof (SErrorInfo));
00138 if (eip == NULL) {
00139 return NULL;
00140 }
00141 eip->category = eAlnErr_Unknown;
00142 eip->line_num = -1;
00143 eip->id = NULL;
00144 eip->message = NULL;
00145 eip->next = NULL;
00146 last = list;
00147 while (last != NULL && last->next != NULL) {
00148 last = last->next;
00149 }
00150 if (last != NULL) {
00151 last->next = eip;
00152 }
00153 return eip;
00154 }
00155
00156
00157
00158
00159
00160 extern void ErrorInfoFree (TErrorInfoPtr eip)
00161 {
00162 if (eip == NULL) {
00163 return;
00164 }
00165 ErrorInfoFree (eip->next);
00166 free (eip->id);
00167 free (eip->message);
00168 free (eip);
00169 }
00170
00171
00172
00173
00174 static void
00175 s_ReportCharCommentError
00176 (char * expected,
00177 char seen,
00178 char * val_name,
00179 FReportErrorFunction errfunc,
00180 void * errdata)
00181 {
00182 TErrorInfoPtr eip;
00183 const char * errformat = "Specified %s character does not match NEXUS"
00184 " comment in file (specified %s, comment %c)";
00185
00186 if (errfunc == NULL || val_name == NULL || expected == NULL) {
00187 return;
00188 }
00189
00190 eip = ErrorInfoNew (NULL);
00191 if (eip != NULL) {
00192 eip->category = eAlnErr_BadFormat;
00193 eip->message = (char *) malloc (strlen (errformat) + strlen (val_name)
00194 + strlen (expected) + 2);
00195 if (eip->message != NULL) {
00196 sprintf (eip->message, errformat, val_name, expected, seen);
00197 }
00198 errfunc (eip, errdata);
00199 }
00200 }
00201
00202
00203
00204
00205
00206 static void
00207 s_ReportBadCharError
00208 (char * id,
00209 char bad_char,
00210 int num_bad,
00211 int offset,
00212 int line_number,
00213 char * reason,
00214 FReportErrorFunction errfunc,
00215 void * errdata)
00216 {
00217 TErrorInfoPtr eip;
00218 const char * err_format =
00219 "%d bad characters (%c) found at position %d (%s).";
00220
00221 if (errfunc == NULL || num_bad == 0 || bad_char == 0
00222 || reason == NULL) {
00223 return;
00224 }
00225
00226 eip = ErrorInfoNew (NULL);
00227 if (eip == NULL) {
00228 return;
00229 }
00230
00231 eip->category = eAlnErr_BadData;
00232 if (id != NULL) eip->id = strdup (id);
00233 eip->line_num = line_number;
00234 eip->message = (char *) malloc (strlen (err_format) + 2 * kMaxPrintedIntLen
00235 + strlen (reason) + 3);
00236 if (eip->message != NULL)
00237 {
00238 sprintf (eip->message, err_format, num_bad, bad_char, offset, reason);
00239 }
00240 errfunc (eip, errdata);
00241 }
00242
00243
00244
00245
00246
00247 static void
00248 s_ReportInconsistentID
00249 (char * id,
00250 int line_number,
00251 FReportErrorFunction report_error,
00252 void * report_error_userdata)
00253 {
00254 TErrorInfoPtr eip;
00255
00256 if (report_error == NULL) {
00257 return;
00258 }
00259 eip = ErrorInfoNew (NULL);
00260 if (eip == NULL) {
00261 return;
00262 }
00263 eip->category = eAlnErr_BadFormat;
00264 eip->id = strdup (id);
00265 eip->line_num = line_number;
00266 eip->message = strdup ("Found unexpected ID");
00267 report_error (eip, report_error_userdata);
00268 }
00269
00270
00271
00272
00273
00274 static void
00275 s_ReportInconsistentBlockLine
00276 (char * id,
00277 int line_number,
00278 FReportErrorFunction report_error,
00279 void * report_error_userdata)
00280 {
00281 TErrorInfoPtr eip;
00282
00283 if (report_error == NULL) {
00284 return;
00285 }
00286 eip = ErrorInfoNew (NULL);
00287 if (eip == NULL) {
00288 return;
00289 }
00290 eip->category = eAlnErr_BadFormat;
00291 eip->id = strdup (id);
00292 eip->line_num = line_number;
00293 eip->message = strdup ("Inconsistent block line formatting");
00294 report_error (eip, report_error_userdata);
00295 }
00296
00297
00298 #if 0
00299
00300
00301
00302
00303 static void
00304 s_ReportDefinitionLineMismatch
00305 (FReportErrorFunction report_error,
00306 void * report_error_userdata)
00307 {
00308 TErrorInfoPtr eip;
00309
00310 if (report_error == NULL) {
00311 return;
00312 }
00313 eip = ErrorInfoNew (NULL);
00314 if (eip == NULL) {
00315 return;
00316 }
00317
00318 eip->category = eAlnErr_BadData;
00319 eip->message = strdup ("Mismatched definition lines");
00320 report_error (eip, report_error_userdata);
00321 }
00322 #endif
00323
00324
00325
00326
00327
00328 static void
00329 s_ReportDefinitionLines
00330 (TStringCountPtr list,
00331 FReportErrorFunction report_error,
00332 void * report_error_userdata)
00333 {
00334 TErrorInfoPtr eip;
00335 const char * err_null_format = "Null definition line occurs %d times";
00336 const char * err_format = "Definition line %s occurs %d times";
00337
00338 if (list == NULL || report_error == NULL) {
00339 return;
00340 }
00341 eip = ErrorInfoNew (NULL);
00342 if (eip == NULL) {
00343 return;
00344 }
00345
00346 eip->category = eAlnErr_BadData;
00347 if (list->string == NULL) {
00348 eip->message = (char*)malloc (strlen (err_null_format)
00349 + kMaxPrintedIntLen + 1);
00350 if (eip->message != NULL) {
00351 sprintf (eip->message, err_null_format, list->num_appearances);
00352 }
00353 } else {
00354 eip->message = (char*)malloc (strlen (err_format)
00355 + strlen (list->string)
00356 + kMaxPrintedIntLen + 1);
00357 if (eip->message != NULL) {
00358 sprintf (eip->message, err_format, list->string,
00359 list->num_appearances);
00360 }
00361 }
00362 report_error (eip, report_error_userdata);
00363
00364 s_ReportDefinitionLines (list->next, report_error, report_error_userdata);
00365 }
00366
00367
00368
00369
00370
00371 static void
00372 s_ReportLineLengthError
00373 (char * id,
00374 TLineInfoPtr lip,
00375 int expected_length,
00376 FReportErrorFunction report_error,
00377 void * report_error_userdata)
00378 {
00379 TErrorInfoPtr eip;
00380 char * msg;
00381 const char * format = "Expected line length %d, actual length %d";
00382 int len;
00383
00384 if (lip == NULL || report_error == NULL) {
00385 return;
00386 }
00387
00388 eip = ErrorInfoNew (NULL);
00389 if (eip == NULL) {
00390 return;
00391 }
00392 eip->category = eAlnErr_BadFormat;
00393 eip->id = strdup (id);
00394 eip->line_num = lip->line_num;
00395 msg = (char *) malloc (strlen (format) + kMaxPrintedIntLen + 1);
00396 if (msg != NULL) {
00397 if (lip->data == NULL) {
00398 len = 0;
00399 } else {
00400 len = strlen (lip->data);
00401 }
00402 sprintf (msg, format, expected_length, len);
00403 eip->message = msg;
00404 }
00405 report_error (eip, report_error_userdata);
00406 }
00407
00408
00409
00410
00411
00412 static void
00413 s_ReportBlockLengthError
00414 (char * id,
00415 int line_num,
00416 int expected_num,
00417 int actual_num,
00418 FReportErrorFunction report_error,
00419 void * report_error_userdata)
00420 {
00421 TErrorInfoPtr eip;
00422 const char * err_format = "Expected %d lines in block, found %d";
00423
00424 if (report_error == NULL) {
00425 return;
00426 }
00427
00428 eip = ErrorInfoNew (NULL);
00429 if (eip == NULL) {
00430 return;
00431 }
00432 eip->category = eAlnErr_BadFormat;
00433 eip->id = strdup (id);
00434 eip->line_num = line_num;
00435 eip->message = (char *)malloc (strlen (err_format) + 2 * kMaxPrintedIntLen + 1);
00436 if (eip->message != NULL) {
00437 sprintf (eip->message, err_format, expected_num, actual_num);
00438 }
00439 report_error (eip, report_error_userdata);
00440 }
00441
00442
00443
00444
00445
00446 static void
00447 s_ReportDuplicateIDError
00448 (char * id,
00449 int line_num,
00450 FReportErrorFunction report_error,
00451 void * report_error_userdata)
00452 {
00453 TErrorInfoPtr eip;
00454 const char * err_format = "Duplicate ID! Sequences will be concatenated!";
00455
00456 if (report_error == NULL) {
00457 return;
00458 }
00459
00460 eip = ErrorInfoNew (NULL);
00461 if (eip == NULL) {
00462 return;
00463 }
00464 eip->category = eAlnErr_BadData;
00465 eip->id = strdup (id);
00466 eip->line_num = line_num;
00467 eip->message = strdup (err_format);
00468 report_error (eip, report_error_userdata);
00469 }
00470
00471
00472
00473
00474
00475 static void
00476 s_ReportMissingSequenceData
00477 (char * id,
00478 FReportErrorFunction report_error,
00479 void * report_error_userdata)
00480 {
00481 TErrorInfoPtr eip;
00482
00483 if (report_error == NULL) {
00484 return;
00485 }
00486 eip = ErrorInfoNew (NULL);
00487 if (eip == NULL) {
00488 return;
00489 }
00490 eip->category = eAlnErr_Fatal;
00491 eip->id = strdup (id);
00492 eip->message = strdup ("No data found");
00493 report_error (eip, report_error_userdata);
00494 }
00495
00496
00497
00498
00499
00500
00501 static void
00502 s_ReportBadSequenceLength
00503 (char * id,
00504 int expected_length,
00505 int actual_length,
00506 FReportErrorFunction report_error,
00507 void * report_error_userdata)
00508 {
00509 TErrorInfoPtr eip;
00510 const char * format_str = "Expected sequence length %d, actual length %d";
00511
00512 if (report_error == NULL) {
00513 return;
00514 }
00515 eip = ErrorInfoNew (NULL);
00516 if (eip == NULL) {
00517 return;
00518 }
00519 eip->category = eAlnErr_BadFormat;
00520 eip->id = strdup (id);
00521 eip->message = (char *)malloc (strlen (format_str) + 50);
00522 if (eip->message != NULL) {
00523 sprintf (eip->message, format_str, expected_length, actual_length);
00524 }
00525 report_error (eip, report_error_userdata);
00526 }
00527
00528
00529
00530
00531
00532 static void
00533 s_ReportIncorrectNumberOfSequences
00534 (int num_expected,
00535 int num_found,
00536 FReportErrorFunction report_error,
00537 void * report_error_userdata)
00538 {
00539 TErrorInfoPtr eip;
00540 const char * err_format = "Expected %d sequences, found %d";
00541
00542 if (report_error == NULL) {
00543 return;
00544 }
00545 eip = ErrorInfoNew (NULL);
00546 if (eip == NULL) {
00547 return;
00548 }
00549 eip->category = eAlnErr_BadFormat;
00550 eip->message = (char *) malloc (strlen (err_format) +
00551 2 * kMaxPrintedIntLen + 1);
00552
00553 if (eip->message != NULL)
00554 {
00555 sprintf (eip->message, err_format, num_expected, num_found);
00556 }
00557 report_error (eip, report_error_userdata);
00558 }
00559
00560
00561 static void
00562 s_ReportIncorrectSequenceLength
00563 (int len_expected,
00564 int len_found,
00565 FReportErrorFunction report_error,
00566 void * report_error_userdata)
00567 {
00568 TErrorInfoPtr eip;
00569 const char * err_format = "Expected sequences of length %d, found %d";
00570
00571 if (report_error == NULL) {
00572 return;
00573 }
00574 eip = ErrorInfoNew (NULL);
00575 if (eip == NULL) {
00576 return;
00577 }
00578
00579 eip->category = eAlnErr_BadFormat;
00580 eip->message = (char *)malloc (strlen (err_format)
00581 + 2 * kMaxPrintedIntLen + 1);
00582 if (eip->message != NULL)
00583 {
00584 sprintf (eip->message, err_format, len_expected, len_found);
00585 }
00586 report_error (eip, report_error_userdata);
00587 }
00588
00589
00590
00591
00592
00593 static void
00594 s_ReportRepeatedOrganismName
00595 (char * id,
00596 int line_num,
00597 int second_line_num,
00598 char * org_name,
00599 FReportErrorFunction report_error,
00600 void * report_error_userdata)
00601 {
00602 TErrorInfoPtr eip;
00603 const char * err_format = "Organism name %s also appears at line %d";
00604
00605 if (report_error == NULL || org_name == NULL) {
00606 return;
00607 }
00608 eip = ErrorInfoNew (NULL);
00609 if (eip == NULL) {
00610 return;
00611 }
00612 eip->category = eAlnErr_BadData;
00613 eip->line_num = line_num;
00614 if (id != NULL ) {
00615 eip->id = strdup (id);
00616 }
00617 eip->message = (char *)malloc (strlen (err_format) + strlen (org_name)
00618 + kMaxPrintedIntLen + 1);
00619 if (eip->message != NULL) {
00620 sprintf (eip->message, err_format, org_name, second_line_num);
00621 }
00622 report_error (eip, report_error_userdata);
00623 }
00624
00625
00626
00627
00628
00629 static void
00630 s_ReportMissingOrganismInfo
00631 (FReportErrorFunction report_error,
00632 void * report_error_userdata)
00633 {
00634 TErrorInfoPtr eip;
00635
00636 if (report_error == NULL) {
00637 return;
00638 }
00639 eip = ErrorInfoNew (NULL);
00640 if (eip == NULL) {
00641 return;
00642 }
00643
00644 eip->category = eAlnErr_BadData;
00645 eip->message = strdup ("Missing organism information");
00646 report_error (eip, report_error_userdata);
00647 }
00648
00649
00650
00651
00652
00653 static void
00654 s_ReportRepeatedId
00655 (TStringCountPtr scp,
00656 FReportErrorFunction report_error,
00657 void * report_error_userdata)
00658 {
00659 TErrorInfoPtr eip;
00660 const char * err_format = "ID %s appears in the following locations:";
00661 char * cp;
00662 TIntLinkPtr line_number;
00663
00664 if (report_error == NULL || scp == NULL || scp->string == NULL) {
00665 return;
00666 }
00667
00668 eip = ErrorInfoNew (NULL);
00669 if (eip == NULL) {
00670 return;
00671 }
00672
00673 eip->category = eAlnErr_BadData;
00674 eip->id = strdup (scp->string);
00675 if (scp->line_numbers != NULL) {
00676 eip->line_num = scp->line_numbers->ival;
00677 }
00678 eip->message = (char *) malloc ( strlen (err_format)
00679 + strlen (scp->string)
00680 + scp->num_appearances * 15
00681 + 1);
00682 if (eip->message != NULL) {
00683 sprintf (eip->message, err_format, scp->string);
00684 cp = eip->message + strlen (eip->message);
00685 for (line_number = scp->line_numbers;
00686 line_number != NULL;
00687 line_number = line_number->next) {
00688 sprintf (cp, " %d", line_number->ival);
00689 cp += strlen (cp);
00690 }
00691 }
00692 report_error (eip, report_error_userdata);
00693 }
00694
00695
00696
00697
00698
00699 static void
00700 s_ReportASN1Error
00701 (FReportErrorFunction errfunc,
00702 void * errdata)
00703 {
00704 TErrorInfoPtr eip;
00705 const char * msg = "This is an ASN.1 file, "
00706 "which cannot be read by this function.";
00707
00708 if (errfunc == NULL) {
00709 return;
00710 }
00711
00712 eip = ErrorInfoNew (NULL);
00713 if (eip != NULL) {
00714 eip->category = eAlnErr_BadData;
00715 eip->message = (char *) malloc (strlen (msg) + 1);
00716 if (eip->message != NULL) {
00717 sprintf (eip->message, "%s", msg);
00718 }
00719 errfunc (eip, errdata);
00720 }
00721 }
00722
00723
00724
00725
00726
00727 static void
00728 s_ReportSegmentedAlignmentError
00729 (TIntLinkPtr offset_list,
00730 FReportErrorFunction errfunc,
00731 void * errdata)
00732 {
00733 TErrorInfoPtr eip;
00734 const char * msg = "This file contains sequences in brackets (indicating "
00735 "a segmented alignment) as well as sequences not in brackets at lines "
00736 "%s. Please either add or remove brackets to correct this problem.";
00737 int num_lines = 0;
00738 int msg_len = 0;
00739 TIntLinkPtr t;
00740 char * line_text_list;
00741 char * line_text_list_offset;
00742
00743 if (errfunc == NULL || offset_list == NULL) {
00744 return;
00745 }
00746
00747 for (t = offset_list; t != NULL; t = t->next)
00748 {
00749 num_lines ++;
00750 }
00751 msg_len = num_lines * (kMaxPrintedIntLen + 2);
00752 if (num_lines > 1)
00753 {
00754 msg_len += 4;
00755 }
00756 line_text_list = (char *) malloc (msg_len);
00757 if (line_text_list == NULL) return;
00758 line_text_list_offset = line_text_list;
00759 for (t = offset_list; t != NULL; t = t->next)
00760 {
00761 if (t->next == NULL)
00762 {
00763 sprintf (line_text_list_offset, "%d", t->ival);
00764 }
00765 else if (num_lines == 2)
00766 {
00767 sprintf (line_text_list_offset, "%d and ", t->ival);
00768 }
00769 else if (t->next->next == NULL)
00770 {
00771 sprintf (line_text_list_offset, "%d, and ", t->ival);
00772 }
00773 else
00774 {
00775 sprintf (line_text_list_offset, "%d, ", t->ival);
00776 }
00777 line_text_list_offset += strlen (line_text_list_offset);
00778 }
00779
00780 msg_len += strlen (msg) + 1;
00781
00782 eip = ErrorInfoNew (NULL);
00783 if (eip != NULL) {
00784 eip->category = eAlnErr_BadData;
00785 eip->message = (char *) malloc (msg_len);
00786 if (eip->message != NULL) {
00787 sprintf (eip->message, msg, line_text_list);
00788 }
00789 errfunc (eip, errdata);
00790 }
00791 free (line_text_list);
00792 }
00793
00794
00795
00796
00797
00798 static void s_ReportOrgCommentError
00799 (char * linestring,
00800 FReportErrorFunction errfunc,
00801 void * errdata)
00802 {
00803 TErrorInfoPtr eip;
00804 const char * msg = "This line may contain an improperly formatted organism description.\n"
00805 "Organism descriptions should be of the form [org=tax name] or [organism=tax name].\n";
00806
00807 if (errfunc == NULL || linestring == NULL) {
00808 return;
00809 }
00810
00811 eip = ErrorInfoNew (NULL);
00812 if (eip != NULL) {
00813 eip->category = eAlnErr_BadData;
00814 eip->message = (char *) malloc (strlen (msg) + strlen (linestring) + 1);
00815 if (eip->message != NULL) {
00816 strcpy (eip->message, msg);
00817 strcat (eip->message, linestring);
00818 }
00819 errfunc (eip, errdata);
00820 }
00821 }
00822
00823
00824
00825
00826
00827 static void s_ReportBadNumSegError
00828 (int line_num,
00829 int num_seg,
00830 int num_seg_exp,
00831 FReportErrorFunction errfunc,
00832 void * errdata)
00833 {
00834 TErrorInfoPtr eip;
00835 const char * msg = "This segmented set contains a different number of segments (%d) than expected (%d).\n";
00836
00837 if (errfunc == NULL) {
00838 return;
00839 }
00840
00841 eip = ErrorInfoNew (NULL);
00842 if (eip != NULL) {
00843 eip->line_num = line_num;
00844 eip->category = eAlnErr_BadData;
00845 eip->message = (char *) malloc (strlen (msg) + 2 * kMaxPrintedIntLen + 1);
00846 if (eip->message != NULL) {
00847 sprintf (eip->message, msg, num_seg, num_seg_exp);
00848 }
00849 errfunc (eip, errdata);
00850 }
00851 }
00852
00853
00854
00855
00856
00857
00858 extern TSequenceInfoPtr SequenceInfoNew (void)
00859 {
00860 TSequenceInfoPtr sip;
00861
00862 sip = (TSequenceInfoPtr) malloc (sizeof (SSequenceInfo));
00863 if (sip == NULL) {
00864 return NULL;
00865 }
00866 sip->missing = strdup ("?");
00867 sip->beginning_gap = strdup (".");
00868 sip->middle_gap = strdup ("-");
00869 sip->end_gap = strdup (".");
00870 sip->match = strdup (".");
00871 sip->alphabet = NULL;
00872 return sip;
00873 }
00874
00875
00876
00877
00878
00879 extern void SequenceInfoFree (TSequenceInfoPtr sip)
00880 {
00881 if (sip == NULL) {
00882 return;
00883 }
00884 free (sip->missing);
00885 free (sip->beginning_gap);
00886 free (sip->middle_gap);
00887 free (sip->end_gap);
00888 free (sip->match);
00889 sip->alphabet = NULL;
00890 free (sip);
00891 }
00892
00893
00894
00895
00896 static void
00897 s_ReportUnusedLine
00898 (int line_num_start,
00899 int line_num_stop,
00900 TLineInfoPtr line_val,
00901 FReportErrorFunction errfunc,
00902 void * errdata)
00903 {
00904 TErrorInfoPtr eip;
00905 const char * errformat1 = "Line %d could not be assigned to an interleaved block";
00906 const char * errformat2 = "Lines %d through %d could not be assigned to an interleaved block";
00907 const char * errformat3 = "Contents of unused line: %s";
00908 int skip;
00909
00910 if (errfunc == NULL || line_val == NULL) {
00911 return;
00912 }
00913
00914 eip = ErrorInfoNew (NULL);
00915 if (eip != NULL) {
00916 eip->category = eAlnErr_BadFormat;
00917 eip->line_num = line_num_start;
00918 if (line_num_start == line_num_stop) {
00919 eip->message = (char *) malloc (strlen (errformat1)
00920 + kMaxPrintedIntLen + 1);
00921 if (eip->message != NULL) {
00922 sprintf (eip->message, errformat1, line_num_start);
00923 }
00924 } else {
00925 eip->message = (char *) malloc (strlen (errformat2)
00926 + 2 * kMaxPrintedIntLen + 1);
00927 if (eip->message != NULL) {
00928 sprintf (eip->message, errformat2, line_num_start,
00929 line_num_stop);
00930 }
00931 }
00932 errfunc (eip, errdata);
00933 }
00934
00935 for (skip = line_num_start;
00936 skip < line_num_stop + 1 && line_val != NULL;
00937 skip++) {
00938 if (line_val->data == NULL) {
00939 continue;
00940 }
00941 eip = ErrorInfoNew (NULL);
00942 if (eip != NULL) {
00943 eip->category = eAlnErr_BadFormat;
00944 eip->line_num = skip;
00945 eip->message = (char *) malloc (strlen (errformat3)
00946 + strlen (line_val->data) + 1);
00947 if (eip->message != NULL) {
00948 sprintf (eip->message, errformat3, line_val->data);
00949 }
00950 errfunc (eip, errdata);
00951 }
00952 line_val = line_val->next;
00953 }
00954 }
00955
00956
00957
00958
00959
00960
00961
00962
00963
00964
00965 static TIntLinkPtr
00966 s_IntLinkNew
00967 (int ival,
00968 TIntLinkPtr list)
00969 {
00970 TIntLinkPtr ilp, last;
00971
00972 ilp = (TIntLinkPtr) malloc (sizeof (SIntLink));
00973 if (ilp == NULL) {
00974 return NULL;
00975 }
00976 ilp->ival = ival;
00977 ilp->next = NULL;
00978 last = list;
00979 while (last != NULL && last->next != NULL) {
00980 last = last->next;
00981 }
00982 if (last != NULL) {
00983 last->next = ilp;
00984 }
00985 return ilp;
00986 }
00987
00988
00989
00990
00991
00992 static void s_IntLinkFree (TIntLinkPtr ilp)
00993 {
00994 if (ilp == NULL) {
00995 return;
00996 }
00997 s_IntLinkFree (ilp->next);
00998 free (ilp);
00999 }
01000
01001
01002
01003
01004
01005
01006
01007
01008
01009
01010
01011 static TSizeInfoPtr s_SizeInfoNew (TSizeInfoPtr list)
01012 {
01013 TSizeInfoPtr sip, last;
01014
01015 sip = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
01016 if (sip == NULL) {
01017 return NULL;
01018 }
01019
01020 sip->size_value = 0;
01021 sip->num_appearances = 0;
01022 sip->next = NULL;
01023 last = list;
01024 while (last != NULL && last->next != NULL) {
01025 last = last->next;
01026 }
01027 if (last != NULL) {
01028 last->next = sip;
01029 }
01030 return sip;
01031 }
01032
01033
01034
01035
01036
01037 static void s_SizeInfoFree (TSizeInfoPtr list)
01038 {
01039 if (list == NULL) {
01040 return;
01041 }
01042 s_SizeInfoFree (list->next);
01043 list->next = NULL;
01044 free (list);
01045 }
01046
01047
01048
01049
01050
01051 static EBool
01052 s_SizeInfoIsEqual
01053 (TSizeInfoPtr s1,
01054 TSizeInfoPtr s2)
01055 {
01056 if (s1 == NULL
01057 || s2 == NULL
01058 || s1->size_value != s2->size_value
01059 || s1->num_appearances != s2->num_appearances) {
01060 return eFalse;
01061 }
01062 return eTrue;
01063 }
01064
01065
01066
01067
01068
01069
01070
01071
01072
01073 static TSizeInfoPtr s_AddSizeInfoAppearances
01074 (TSizeInfoPtr list,
01075 int size_value,
01076 int num_appearances)
01077 {
01078 TSizeInfoPtr p, last;
01079
01080 last = NULL;
01081 for (p = list; p != NULL && p->size_value != size_value; p = p->next) {
01082 last = p;
01083 }
01084 if (p == NULL) {
01085 p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
01086 if (p == NULL) {
01087 return NULL;
01088 }
01089 p->size_value = size_value;
01090 p->num_appearances = num_appearances;
01091 p->next = 0;
01092 if (last == NULL) {
01093 list = p;
01094 } else {
01095 last->next = p;
01096 }
01097 } else {
01098 p->num_appearances += num_appearances;
01099 }
01100 return list;
01101 }
01102
01103
01104
01105
01106
01107
01108
01109
01110
01111 static TSizeInfoPtr
01112 s_AddSizeInfo
01113 (TSizeInfoPtr list,
01114 int size_value)
01115 {
01116 return s_AddSizeInfoAppearances (list, size_value, 1);
01117 }
01118
01119
01120
01121
01122
01123
01124
01125
01126 static TSizeInfoPtr s_GetMostPopularSizeInfo (TSizeInfoPtr list)
01127 {
01128 TSizeInfoPtr p, best;
01129
01130 if (list == NULL) {
01131 return NULL;
01132 }
01133
01134 best = list;
01135 for (p = list->next; p != NULL; p = p->next) {
01136 if (p->num_appearances > best->num_appearances
01137 || (p->num_appearances == best->num_appearances
01138 && p->size_value > best->size_value)) {
01139 best = p;
01140 }
01141 }
01142 return best;
01143 }
01144
01145
01146
01147
01148
01149
01150
01151
01152 static int s_GetMostPopularSize (TSizeInfoPtr list)
01153 {
01154 TSizeInfoPtr best;
01155
01156 best = s_GetMostPopularSizeInfo (list);
01157 if (best == NULL) {
01158 return 0;
01159 }
01160 if (best->num_appearances > 1) {
01161 return best->size_value;
01162 } else {
01163 return 0;
01164 }
01165 }
01166
01167
01168
01169
01170
01171 static SLengthListPtr s_LengthListNew (SLengthListPtr list)
01172 {
01173 SLengthListPtr llp, last;
01174
01175 llp = (SLengthListPtr) malloc (sizeof (SLengthListData));
01176 if (llp == NULL) {
01177 return NULL;
01178 }
01179
01180 llp->lengthrepeats = NULL;
01181 llp->num_appearances = 0;
01182 llp->next = NULL;
01183
01184 last = list;
01185 while (last != NULL && last->next != NULL) {
01186 last = last->next;
01187 }
01188 if (last != NULL) {
01189 last->next = llp;
01190 }
01191 return llp;
01192 }
01193
01194
01195
01196
01197
01198 static void s_LengthListFree (SLengthListPtr llp)
01199 {
01200 if (llp == NULL) {
01201 return;
01202 }
01203 s_LengthListFree (llp->next);
01204 s_SizeInfoFree (llp->lengthrepeats);
01205 free (llp);
01206 }
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217 static void
01218 s_AddLengthRepeat
01219 (SLengthListPtr llp,
01220 int size_value)
01221 {
01222 TSizeInfoPtr p, last;
01223
01224 if (llp == NULL) {
01225 return;
01226 }
01227
01228 last = NULL;
01229 for (p = llp->lengthrepeats; p != NULL; p = p->next) {
01230 last = p;
01231 }
01232 if (last == NULL || last->size_value != size_value) {
01233 p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
01234 if (p == NULL) {
01235 return;
01236 }
01237 p->size_value = size_value;
01238 p->num_appearances = 1;
01239 p->next = 0;
01240 if (last == NULL) {
01241 llp->lengthrepeats = p;
01242 } else {
01243 last->next = p;
01244 }
01245 } else {
01246 last->num_appearances ++;
01247 }
01248 }
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258 static EBool
01259 s_DoLengthPatternsMatch
01260 (SLengthListPtr llp1,
01261 SLengthListPtr llp2)
01262 {
01263 TSizeInfoPtr sip1, sip2;
01264
01265 if (llp1 == NULL || llp2 == NULL
01266 || llp1->lengthrepeats == NULL
01267 || llp2->lengthrepeats == NULL) {
01268 return eFalse;
01269 }
01270 for (sip1 = llp1->lengthrepeats, sip2 = llp2->lengthrepeats;
01271 sip1 != NULL && sip2 != NULL;
01272 sip1 = sip1->next, sip2 = sip2->next) {
01273 if ( ! s_SizeInfoIsEqual (sip1, sip2)
01274 || (sip1->next == NULL && sip2->next != NULL)
01275 || (sip1->next != NULL && sip2->next == NULL)) {
01276 return eFalse;
01277 }
01278 }
01279 return eTrue;
01280 }
01281
01282
01283
01284
01285
01286
01287
01288
01289 static SLengthListPtr
01290 s_AddLengthList
01291 (SLengthListPtr list,
01292 SLengthListPtr llp)
01293 {
01294 SLengthListPtr prev_llp;
01295
01296 if (list == NULL) {
01297 list = llp;
01298 } else {
01299 prev_llp = list;
01300 while ( prev_llp->next && ! s_DoLengthPatternsMatch (prev_llp, llp)) {
01301 prev_llp = prev_llp->next;
01302 }
01303 if (s_DoLengthPatternsMatch (prev_llp, llp)) {
01304 prev_llp->num_appearances ++;
01305 s_LengthListFree (llp);
01306 } else {
01307 prev_llp->next = llp;
01308 }
01309 }
01310 return list;
01311 }
01312
01313
01314
01315
01316
01317
01318
01319
01320
01321
01322
01323 static TLineInfoPtr
01324 s_LineInfoNew
01325 (char * string,
01326 int line_num,
01327 int line_offset)
01328 {
01329 TLineInfoPtr lip;
01330
01331 lip = (TLineInfoPtr) malloc (sizeof (SLineInfo));
01332 if (lip == NULL) {
01333 return NULL;
01334 }
01335 lip->data = strdup (string);
01336 lip->line_num = line_num;
01337 lip->line_offset = line_offset;
01338 lip->delete_me = eFalse;
01339 lip->next = NULL;
01340 return lip;
01341 }
01342
01343
01344
01345
01346
01347 static void s_LineInfoFree (TLineInfoPtr lip)
01348 {
01349 TLineInfoPtr next_lip;
01350 if (lip == NULL) {
01351 return;
01352 }
01353 while (lip != NULL) {
01354 next_lip = lip->next;
01355 lip->next = NULL;
01356 free (lip->data);
01357 free (lip);
01358 lip = next_lip;
01359 }
01360 }
01361
01362
01363
01364
01365
01366
01367 static TLineInfoPtr s_DeleteLineInfos (TLineInfoPtr list)
01368 {
01369 TLineInfoPtr prev = NULL;
01370 TLineInfoPtr lip, nextlip;
01371
01372 lip = list;
01373 while (lip != NULL) {
01374 nextlip = lip->next;
01375 if (lip->delete_me) {
01376 if (prev != NULL) {
01377 prev->next = lip->next;
01378 } else {
01379 list = lip->next;
01380 }
01381 lip->next = NULL;
01382 s_LineInfoFree (lip);
01383 } else {
01384 prev = lip;
01385 }
01386 lip = nextlip;
01387 }
01388 return list;
01389 }
01390
01391
01392
01393
01394
01395
01396
01397
01398 static TLineInfoPtr
01399 s_AddLineInfo
01400 (TLineInfoPtr list,
01401 char * string,
01402 int line_num,
01403 int line_offset)
01404 {
01405 TLineInfoPtr lip, p;
01406
01407 if (string == NULL) {
01408 return list;
01409 }
01410 lip = s_LineInfoNew (string, line_num, line_offset);
01411 if (lip == NULL) {
01412 return NULL;
01413 }
01414 if (list == NULL) {
01415 list = lip;
01416 } else {
01417 p = list;
01418 while (p != NULL && p->next != NULL) {
01419 p = p->next;
01420 }
01421 p->next = lip;
01422 }
01423 return list;
01424 }
01425
01426
01427 static TBracketedCommentListPtr s_BracketedCommentListNew
01428 (TBracketedCommentListPtr list,
01429 char * string,
01430 int line_num,
01431 int line_offset)
01432 {
01433 TBracketedCommentListPtr comment;
01434
01435 comment = (TBracketedCommentListPtr) malloc (sizeof (SBracketedCommentList));
01436 if (comment == NULL) {
01437 return NULL;
01438 }
01439 comment->comment_lines = s_LineInfoNew (string, line_num, line_offset);
01440 comment->next = NULL;
01441
01442 if (list != NULL) {
01443 while (list->next != NULL) {
01444 list = list->next;
01445 }
01446 list->next = comment;
01447 }
01448
01449 return comment;
01450 }
01451
01452
01453 static void s_BracketedCommentListFree (TBracketedCommentListPtr list)
01454 {
01455 if (list == NULL) {
01456 return;
01457 }
01458 s_BracketedCommentListFree (list->next);
01459 list->next = NULL;
01460 s_LineInfoFree (list->comment_lines);
01461 }
01462
01463
01464 static void s_BracketedCommentListAddLine
01465 (TBracketedCommentListPtr comment,
01466 char * string,
01467 int line_num,
01468 int line_offset)
01469 {
01470 if (comment == NULL) {
01471 return;
01472 }
01473
01474 comment->comment_lines = s_AddLineInfo (comment->comment_lines, string, line_num, line_offset);
01475 }
01476
01477
01478 static int s_CountSequencesInBracketedComment (TBracketedCommentListPtr comment)
01479 {
01480 TLineInfoPtr lip;
01481 int num_segments = 0;
01482 EBool skipped_line_since_last_defline = eTrue;
01483
01484 if (comment == NULL || comment->comment_lines == NULL) {
01485 return 0;
01486 }
01487
01488 lip = comment->comment_lines;
01489
01490 if (lip->data[0] != '[' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
01491 {
01492 return 0;
01493 }
01494 lip = lip->next;
01495 while (lip != NULL && lip->next != NULL)
01496 {
01497 if (lip->data[0] == '>')
01498 {
01499 if (!skipped_line_since_last_defline)
01500 {
01501 return 0;
01502 }
01503 else
01504 {
01505 num_segments ++;
01506 skipped_line_since_last_defline = eFalse;
01507 }
01508 }
01509 else
01510 {
01511 skipped_line_since_last_defline = eTrue;
01512 }
01513 lip = lip->next;
01514 }
01515
01516
01517 if (lip->data[0] != ']' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
01518 {
01519 return 0;
01520 }
01521
01522 return num_segments;
01523 }
01524
01525
01526
01527
01528
01529
01530
01531 static int s_GetNumSegmentsInAlignment
01532 (TBracketedCommentListPtr comment_list,
01533 FReportErrorFunction errfunc,
01534 void * errdata)
01535 {
01536 TBracketedCommentListPtr comment;
01537 TSizeInfoPtr segcount_list = NULL;
01538 int num_segments = 1;
01539 int num_segments_this_bracket;
01540 int num_segments_expected;
01541 TSizeInfoPtr best;
01542
01543 if (comment_list == NULL)
01544 {
01545 return num_segments;
01546 }
01547
01548 for (comment = comment_list; comment != NULL; comment = comment->next)
01549 {
01550 num_segments_this_bracket = s_CountSequencesInBracketedComment (comment);
01551 segcount_list = s_AddSizeInfoAppearances (segcount_list,
01552 num_segments_this_bracket,
01553 1);
01554 if (comment != comment_list && segcount_list->next != NULL)
01555 {
01556 best = s_GetMostPopularSizeInfo (segcount_list);
01557 num_segments_expected = best->size_value;
01558
01559 if (num_segments_expected != num_segments_this_bracket)
01560 {
01561 s_ReportBadNumSegError (comment->comment_lines->line_num,
01562 num_segments_this_bracket, num_segments_expected,
01563 errfunc, errdata);
01564 }
01565 }
01566 }
01567 if (segcount_list != NULL && segcount_list->next == NULL && segcount_list->size_value > 0)
01568 {
01569 num_segments = segcount_list->size_value;
01570 }
01571 s_SizeInfoFree (segcount_list);
01572 return num_segments;
01573 }
01574
01575
01576
01577
01578 static TIntLinkPtr GetSegmentOffsetList (TBracketedCommentListPtr comment_list)
01579 {
01580 TIntLinkPtr new_offset, offset_list = NULL;
01581 TBracketedCommentListPtr comment;
01582 TLineInfoPtr lip;
01583
01584 if (comment_list == NULL)
01585 {
01586 return NULL;
01587 }
01588
01589 for (comment = comment_list; comment != NULL; comment = comment->next)
01590 {
01591 if (s_CountSequencesInBracketedComment (comment) == 0)
01592 {
01593 continue;
01594 }
01595 for (lip = comment->comment_lines; lip != NULL; lip = lip->next)
01596 {
01597 if (lip->data != NULL && lip->data[0] == '>')
01598 {
01599 new_offset = s_IntLinkNew (lip->line_num + 1, offset_list);
01600 if (offset_list == NULL) offset_list = new_offset;
01601 }
01602 }
01603 }
01604 return offset_list;
01605 }
01606
01607 static char * s_TokenizeString (char * str, char *delimiter, char **last)
01608 {
01609 int skip;
01610 int length;
01611
01612 if (str == NULL) {
01613 str = *last;
01614 }
01615 if (delimiter == NULL) {
01616 *last = NULL;
01617 return NULL;
01618 }
01619
01620 if (str == NULL || *str == 0) {
01621 return NULL;
01622 }
01623 skip = strspn (str, delimiter);
01624 str += skip;
01625 length = strcspn (str, delimiter);
01626 *last = str + length;
01627 if (**last != 0) {
01628 **last = 0;
01629 (*last) ++;
01630 }
01631 return str;
01632 }
01633
01634
01635
01636
01637
01638
01639
01640 static TLineInfoPtr s_BuildTokenList (TLineInfoPtr line_list)
01641 {
01642 TLineInfoPtr first_token, lip;
01643 char * tmp;
01644 char * piece;
01645 char * last;
01646 int line_pos;
01647
01648 first_token = NULL;
01649
01650 for (lip = line_list; lip != NULL; lip = lip->next) {
01651 if (lip->data != NULL && (tmp = strdup (lip->data)) != NULL) {
01652 piece = s_TokenizeString (tmp, " \t\r", &last);
01653 while (piece != NULL) {
01654 line_pos = piece - tmp;
01655 line_pos += lip->line_offset;
01656 first_token = s_AddLineInfo (first_token, piece,
01657 lip->line_num,
01658 line_pos);
01659 piece = s_TokenizeString (NULL, " \t\r", &last);
01660 }
01661 free (tmp);
01662 }
01663 }
01664 return first_token;
01665 }
01666
01667
01668
01669
01670
01671
01672
01673 static char * s_LineInfoMergeAndStripSpaces (TLineInfoPtr list)
01674 {
01675 TLineInfoPtr lip;
01676 int len;
01677 char * result;
01678 char * cp_to;
01679 char * cp_from;
01680
01681 if (list == NULL) {
01682 return NULL;
01683 }
01684 len = 0;
01685 for (lip = list; lip != NULL; lip = lip->next) {
01686 if (lip->data != NULL) {
01687 len += strlen (lip->data);
01688 }
01689 }
01690 result = (char *) malloc (len + 1);
01691 if (result == NULL) {
01692 return result;
01693 }
01694 cp_to = result;
01695 for (lip = list; lip != NULL; lip = lip->next) {
01696 if (lip->data != NULL) {
01697 cp_from = lip->data;
01698 while (*cp_from != 0) {
01699 if (! isspace ((unsigned char)*cp_from)) {
01700 *cp_to = *cp_from;
01701 cp_to ++;
01702 }
01703 cp_from ++;
01704 }
01705 }
01706 }
01707 *cp_to = 0;
01708 return result;
01709 }
01710
01711
01712
01713
01714
01715
01716
01717
01718
01719
01720
01721
01722
01723
01724
01725
01726
01727
01728
01729 static void s_LineInfoReaderAdvancePastSpace (TLineInfoReaderPtr lirp)
01730 {
01731 if (lirp->curr_line_pos == NULL) {
01732 return;
01733 }
01734 while ( isspace ((unsigned char) *lirp->curr_line_pos)
01735 || *lirp->curr_line_pos == 0) {
01736 while ( isspace ((unsigned char)*lirp->curr_line_pos)) {
01737 lirp->curr_line_pos ++;
01738 }
01739 if (*lirp->curr_line_pos == 0) {
01740 lirp->curr_line = lirp->curr_line->next;
01741 while (lirp->curr_line != NULL
01742 && lirp->curr_line->data == NULL) {
01743 lirp->curr_line = lirp->curr_line->next;
01744 }
01745 if (lirp->curr_line == NULL) {
01746 lirp->curr_line_pos = NULL;
01747 return;
01748 } else {
01749 lirp->curr_line_pos = lirp->curr_line->data;
01750 }
01751 }
01752 }
01753 }
01754
01755
01756
01757
01758
01759 static void s_LineInfoReaderReset (TLineInfoReaderPtr lirp)
01760 {
01761 if (lirp == NULL) {
01762 return;
01763 }
01764 lirp->curr_line = lirp->first_line;
01765
01766 while (lirp->curr_line != NULL && lirp->curr_line->data == NULL) {
01767 lirp->curr_line = lirp->curr_line->next;
01768 }
01769 if (lirp->curr_line == NULL) {
01770 lirp->curr_line_pos = NULL;
01771 lirp->data_pos = -1;
01772 } else {
01773 lirp->curr_line_pos = lirp->curr_line->data;
01774 s_LineInfoReaderAdvancePastSpace (lirp);
01775 if (lirp->curr_line_pos == NULL) {
01776 lirp->data_pos = -1;
01777 } else {
01778 lirp->data_pos = 0;
01779 }
01780 }
01781 }
01782
01783
01784
01785
01786
01787
01788
01789
01790 static TLineInfoReaderPtr s_LineInfoReaderNew (TLineInfoPtr line_list)
01791 {
01792 TLineInfoReaderPtr lirp;
01793
01794 if (line_list == NULL) {
01795 return NULL;
01796 }
01797 lirp = (TLineInfoReaderPtr) malloc (sizeof (SLineInfoReader));
01798 if (lirp == NULL) {
01799 return NULL;
01800 }
01801
01802 lirp->first_line = line_list;
01803 s_LineInfoReaderReset (lirp);
01804 return lirp;
01805 }
01806
01807
01808
01809
01810
01811
01812
01813
01814
01815
01816 static int s_LineInfoReaderGetCurrentLineNumber (TLineInfoReaderPtr lirp)
01817 {
01818 if (lirp == NULL || lirp->curr_line == NULL) {
01819 return -1;
01820 } else {
01821 return lirp->curr_line->line_num;
01822 }
01823 }
01824
01825
01826
01827
01828
01829
01830
01831
01832
01833
01834
01835 static int s_LineInfoReaderGetCurrentLineOffset (TLineInfoReaderPtr lirp)
01836 {
01837 if (lirp == NULL || lirp->curr_line == NULL
01838 || lirp->curr_line_pos == NULL) {
01839 return -1;
01840 } else {
01841 return lirp->curr_line->line_offset + lirp->curr_line_pos
01842 - lirp->curr_line->data;
01843 }
01844 }
01845
01846
01847
01848
01849
01850
01851 static void s_LineInfoReaderFree (TLineInfoReaderPtr lirp)
01852 {
01853 if (lirp == NULL) {
01854 return;
01855 }
01856 free (lirp);
01857 lirp = NULL;
01858 }
01859
01860
01861
01862
01863
01864
01865
01866
01867
01868
01869
01870
01871 static char
01872 s_FindNthDataChar
01873 (TLineInfoReaderPtr lirp,
01874 int pos)
01875 {
01876 if (lirp == NULL || lirp->first_line == NULL || pos < 0
01877 || lirp->data_pos == -1) {
01878 return 0;
01879 }
01880
01881 if (lirp->data_pos == pos) {
01882 if (lirp->curr_line_pos == NULL) {
01883 return 0;
01884 } else {
01885 return *lirp->curr_line_pos;
01886 }
01887 }
01888 if (lirp->data_pos > pos) {
01889 s_LineInfoReaderReset (lirp);
01890 }
01891
01892 while (lirp->data_pos < pos && lirp->curr_line != NULL) {
01893 lirp->curr_line_pos ++;
01894
01895 s_LineInfoReaderAdvancePastSpace (lirp);
01896 lirp->data_pos ++;
01897 }
01898 if (lirp->curr_line_pos != NULL) {
01899 return *lirp->curr_line_pos;
01900 } else {
01901 return 0;
01902 }
01903 }
01904
01905
01906
01907
01908
01909
01910
01911
01912
01913
01914
01915
01916
01917
01918
01919 static TStringCountPtr s_StringCountNew (TStringCountPtr list)
01920 {
01921 TStringCountPtr new_item, last;
01922
01923 new_item = (TStringCountPtr) malloc (sizeof (SStringCount));
01924 if (new_item == NULL) {
01925 return NULL;
01926 }
01927 new_item->string = NULL;
01928 new_item->num_appearances = 0;
01929 new_item->line_numbers = NULL;
01930 new_item->next = NULL;
01931
01932 last = list;
01933 while (last != NULL && last->next != NULL) {
01934 last = last->next;
01935 }
01936 if (last != NULL) {
01937 last->next = new_item;
01938 }
01939 return new_item;
01940 }
01941
01942
01943
01944
01945
01946
01947 static void s_StringCountFree (TStringCountPtr list)
01948 {
01949 if (list == NULL) {
01950 return;
01951 }
01952 s_StringCountFree (list->next);
01953 s_IntLinkFree (list->line_numbers);
01954 free (list);
01955 }
01956
01957
01958
01959
01960
01961
01962
01963
01964
01965 static TStringCountPtr s_AddStringCount (
01966 char * string,
01967 int line_num,
01968 TStringCountPtr list
01969 )
01970 {
01971 TStringCountPtr add_to, last;
01972 TIntLinkPtr new_offset;
01973
01974 if (string == NULL) {
01975 for (add_to = list;
01976 add_to != NULL && add_to->string != NULL;
01977 add_to = add_to->next) {
01978 last = add_to;
01979 }
01980 } else {
01981 for (add_to = list;
01982 add_to != NULL
01983 && (add_to->string == NULL
01984 || strcmp (string, add_to->string) != 0);
01985 add_to = add_to->next) {
01986 last = add_to;
01987 }
01988 }
01989
01990 if (add_to == NULL) {
01991 add_to = s_StringCountNew (list);
01992 if (list == NULL) list = add_to;
01993 if (add_to != NULL) {
01994 add_to->string = string;
01995 }
01996 }
01997 if (add_to != NULL) {
01998 add_to->num_appearances ++;
01999 new_offset = s_IntLinkNew (line_num, add_to->line_numbers);
02000 if (add_to->line_numbers == NULL) {
02001 add_to->line_numbers = new_offset;
02002 }
02003 }
02004 return list;
02005 }
02006
02007
02008
02009
02010
02011
02012
02013 static int s_StringNICmp (char * str1, char *str2, int cmp_count)
02014 {
02015 char * cp1;
02016 char * cp2;
02017 int char_count, diff;
02018
02019 if (str1 == NULL && str2 == NULL) {
02020 return 0;
02021 }
02022 if (str1 == NULL) {
02023 return -1;
02024 }
02025 if (str2 == NULL) {
02026 return 1;
02027 }
02028 cp1 = str1;
02029 cp2 = str2;
02030 char_count = 0;
02031 while (*cp1 != 0 && *cp2 != 0 && char_count < cmp_count) {
02032 diff = toupper ((unsigned char)(*cp1)) - toupper ((unsigned char)(*cp2));
02033 if (diff != 0) {
02034 return diff;
02035 }
02036 char_count ++;
02037 cp1++;
02038 cp2++;
02039 }
02040 if (char_count == cmp_count) {
02041 return 0;
02042 } else if (*cp1 == 0 && *cp2 != 0) {
02043 return -1;
02044 } else if (*cp1 != 0 && *cp2 == 0) {
02045 return 1;
02046 } else {
02047 return 0;
02048 }
02049 }
02050
02051
02052
02053
02054
02055 static int s_StringICmp (char * str1, char *str2)
02056 {
02057 char * cp1;
02058 char * cp2;
02059 int diff;
02060
02061 if (str1 == NULL && str2 == NULL) {
02062 return 0;
02063 }
02064 if (str1 == NULL) {
02065 return -1;
02066 }
02067 if (str2 == NULL) {
02068 return 1;
02069 }
02070 cp1 = str1;
02071 cp2 = str2;
02072 while (*cp1 != 0 && *cp2 != 0) {
02073 diff = toupper ((unsigned char) *cp1) - toupper ((unsigned char) *cp2);
02074 if (diff != 0) {
02075 return diff;
02076 }
02077 cp1++;
02078 cp2++;
02079 }
02080 if (*cp1 == 0 && *cp2 != 0) {
02081 return -1;
02082 } else if (*cp1 != 0 && *cp2 == 0) {
02083 return 1;
02084 } else {
02085 return 0;
02086 }
02087 }
02088
02089
02090
02091
02092
02093
02094
02095
02096
02097
02098
02099
02100 static void
02101 s_GetFASTAExpectedNumbers
02102 (char * str,
02103 SAlignRawFilePtr afrp)
02104 {
02105 char * cp;
02106 char * cpend;
02107 char replace;
02108 int first, second;
02109
02110 if (str == NULL || afrp == NULL) {
02111 return;
02112 }
02113 cp = str;
02114 while (! isdigit ((unsigned char)*cp) && *cp != 0) {
02115 cp++;
02116 }
02117
02118 cpend = cp;
02119 while (isdigit ((unsigned char)*cpend) && *cpend != 0) {
02120 cpend++;
02121 }
02122 if (cp == cpend) {
02123 return;
02124 }
02125 replace = *cpend;
02126 *cpend = 0;
02127 first = atol (cp);
02128 *cpend = replace;
02129
02130 cp = cpend;
02131 while (! isdigit ((unsigned char)*cp) && *cp != 0) {
02132 cp++;
02133 }
02134
02135 cpend = cp;
02136 while (isdigit ((unsigned char)*cpend) && *cpend != 0) {
02137 cpend++;
02138 }
02139 if (cp == cpend) {
02140 return;
02141 }
02142 replace = *cpend;
02143 *cpend = 0;
02144 second = atol (cp);
02145 *cpend = replace;
02146
02147 if (first > 0 && second > 0) {
02148 afrp->expected_num_sequence = first;
02149 afrp->expected_sequence_len = second;
02150 }
02151
02152 }
02153
02154
02155
02156
02157
02158
02159 static EBool s_IsTwoNumbersSeparatedBySpace (char * str)
02160 {
02161 char * cp;
02162 EBool found_first_number = eFalse;
02163 EBool found_dividing_space = eFalse;
02164 EBool found_second_number = eFalse;
02165 EBool found_second_number_end = eFalse;
02166
02167 if (str == NULL) {
02168 return eFalse;
02169 }
02170 cp = str;
02171 while (*cp != 0) {
02172 if (! isdigit ((unsigned char)*cp) && ! isspace ((unsigned char)*cp)) {
02173 return eFalse;
02174 }
02175 if (! found_first_number) {
02176 if (! isdigit ((unsigned char)*cp)) {
02177 return eFalse;
02178 }
02179 found_first_number = eTrue;
02180 } else if (! found_dividing_space) {
02181 if ( isspace ((unsigned char) *cp)) {
02182 found_dividing_space = eTrue;
02183 } else if ( ! isdigit ((unsigned char)*cp)) {
02184 return eFalse;
02185 }
02186 } else if (! found_second_number) {
02187 if ( isdigit ((unsigned char)*cp)) {
02188 found_second_number = eTrue;
02189 } else if (! isspace ((unsigned char) *cp)) {
02190 return eFalse;
02191 }
02192 } else if (! found_second_number_end) {
02193 if ( isspace ((unsigned char) *cp)) {
02194 found_second_number_end = eTrue;
02195 } else if (! isdigit ((unsigned char)*cp)) {
02196 return eFalse;
02197 }
02198 } else if (! isspace ((unsigned char) *cp)) {
02199 return eFalse;
02200 }
02201 cp++;
02202 }
02203 if (found_second_number) {
02204 return eTrue;
02205 }
02206 return eFalse;
02207 }
02208
02209
02210
02211
02212
02213
02214
02215
02216 static EBool
02217 s_GetOneNexusSizeComment
02218 (char * str,
02219 char * valname,
02220 int * val)
02221 {
02222 char buf[MAX_PRINTED_INT_LEN_PLUS_ONE];
02223 char * cpstart;
02224 char * cpend;
02225 int maxlen;
02226
02227 if (str == NULL || valname == NULL || val == NULL) {
02228 return eFalse;
02229 }
02230
02231 cpstart = strstr (str, valname);
02232 if (cpstart == NULL) {
02233 return eFalse;
02234 }
02235 cpstart += strlen (valname);
02236 while (*cpstart != 0 && isspace ((unsigned char)*cpstart)) {
02237 cpstart++;
02238 }
02239 if (*cpstart != '=') {
02240 return eFalse;
02241 }
02242 cpstart ++;
02243 while (*cpstart != 0 && isspace ((unsigned char)*cpstart)) {
02244 cpstart++;
02245 }
02246
02247 if (! isdigit ((unsigned char)*cpstart)) {
02248 return eFalse;
02249 }
02250 cpend = cpstart + 1;
02251 while ( *cpend != 0 && isdigit ((unsigned char)*cpend)) {
02252 cpend ++;
02253 }
02254 maxlen = cpend - cpstart;
02255 if (maxlen > kMaxPrintedIntLen) maxlen = kMaxPrintedIntLen;
02256
02257 strncpy (buf, cpstart, maxlen);
02258 buf [maxlen] = 0;
02259 *val = atoi (buf);
02260 return eTrue;
02261 }
02262
02263
02264
02265
02266
02267
02268
02269 static void
02270 s_GetNexusSizeComments
02271 (char * str,
02272 EBool * found_ntax,
02273 EBool * found_nchar,
02274 SAlignRawFilePtr afrp)
02275 {
02276 int num_sequences;
02277 int num_chars;
02278
02279 if (str == NULL || found_nchar == NULL
02280 || found_ntax == NULL || afrp == NULL) {
02281 return;
02282 }
02283 if (! *found_ntax &&
02284 (s_GetOneNexusSizeComment (str, "ntax", &num_sequences)
02285 || s_GetOneNexusSizeComment (str, "NTAX", &num_sequences))) {
02286 afrp->expected_num_sequence = num_sequences;
02287 afrp->align_format_found = eTrue;
02288 *found_ntax = eTrue;
02289 }
02290 if (! *found_nchar &&
02291 (s_GetOneNexusSizeComment (str, "nchar", &num_chars)
02292 || s_GetOneNexusSizeComment (str, "NCHAR", &num_chars))) {
02293 afrp->expected_sequence_len = num_chars;
02294 afrp->align_format_found = eTrue;
02295 *found_nchar = eTrue;
02296 }
02297 }
02298
02299
02300
02301
02302
02303
02304
02305
02306 static char GetNexusTypechar (char * str, char * val_name)
02307 {
02308 char * cp;
02309 char * cpend;
02310
02311 if (str == NULL || val_name == NULL) {
02312 return 0;
02313 }
02314 cpend = strstr (str, ";");
02315 if (cpend == NULL) {
02316 return 0;
02317 }
02318 cp = strstr (str, val_name);
02319 if (cp == NULL || cp > cpend) {
02320 return 0;
02321 }
02322 cp += strlen (val_name);
02323 while ( isspace ((unsigned char)*cp)) {
02324 cp ++;
02325 }
02326 if (*cp != '=') {
02327 return 0;
02328 }
02329 cp++;
02330 while ( isspace ((unsigned char)*cp) || *cp == '\'') {
02331 cp ++;
02332 }
02333 return *cp;
02334 }
02335
02336
02337
02338
02339
02340
02341
02342
02343 static EBool s_CheckNexusCharInfo
02344 (char * str,
02345 TSequenceInfoPtr sequence_info,
02346 FReportErrorFunction errfunc,
02347 void * errdata)
02348 {
02349 char * cp;
02350 char c;
02351
02352 if (str == NULL || sequence_info == NULL) {
02353 return eFalse;
02354 }
02355
02356 cp = strstr (str, "format ");
02357 if (cp == NULL) {
02358 cp = strstr (str, "FORMAT ");
02359 }
02360 if (cp == NULL) {
02361 return eFalse;
02362 }
02363
02364 if (errfunc == NULL) {
02365 return eTrue;
02366 }
02367
02368 c = GetNexusTypechar (cp + 7, "missing");
02369 if (c == 0) {
02370 c = GetNexusTypechar (cp + 7, "MISSING");
02371 }
02372 if (c != 0 && sequence_info->missing != NULL
02373 && strchr (sequence_info->missing, c) == NULL)
02374 {
02375 s_ReportCharCommentError (sequence_info->missing, c, "MISSING",
02376 errfunc, errdata);
02377 }
02378
02379 c = GetNexusTypechar (cp + 7, "gap");
02380 if (c == 0) {
02381 c = GetNexusTypechar (cp + 7, "GAP");
02382 }
02383 if (c != 0 && sequence_info->middle_gap != NULL
02384 && strchr (sequence_info->middle_gap, c) == NULL)
02385 {
02386 s_ReportCharCommentError (sequence_info->middle_gap, c, "GAP",
02387 errfunc, errdata);
02388 }
02389
02390 c = GetNexusTypechar (cp + 7, "match");
02391 if (c == 0) {
02392 c = GetNexusTypechar (cp + 7, "MATCH");
02393 }
02394 if (c != 0 && sequence_info->match != NULL
02395 && strchr (sequence_info->match, c) == NULL)
02396 {
02397 s_ReportCharCommentError (sequence_info->match, c, "MATCH",
02398 errfunc, errdata);
02399 }
02400 return eTrue;
02401 }
02402
02403
02404 static char * s_ReplaceNexusTypeChar (char *str, char c)
02405 {
02406 if (str == NULL
02407 || c != *str
02408 || *(str + 1) != 0)
02409 {
02410 if (str != NULL)
02411 {
02412 free (str);
02413 }
02414 str = (char *)malloc (2 * sizeof (char));
02415 if (str != NULL)
02416 {
02417 str [0] = c;
02418 str [1] = 0;
02419 }
02420 }
02421 return str;
02422 }
02423
02424
02425
02426
02427
02428 static EBool s_UpdateNexusCharInfo
02429 (char * str,
02430 TSequenceInfoPtr sequence_info)
02431 {
02432 char * cp;
02433 char c;
02434
02435 if (str == NULL || sequence_info == NULL) {
02436 return eFalse;
02437 }
02438
02439 cp = strstr (str, "format ");
02440 if (cp == NULL) {
02441 cp = strstr (str, "FORMAT ");
02442 }
02443 if (cp == NULL) {
02444 return eFalse;
02445 }
02446
02447 c = GetNexusTypechar (cp + 7, "missing");
02448 if (c == 0) {
02449 c = GetNexusTypechar (cp + 7, "MISSING");
02450 }
02451 sequence_info->missing = s_ReplaceNexusTypeChar (sequence_info->missing, c);
02452
02453 c = GetNexusTypechar (cp + 7, "gap");
02454 if (c == 0) {
02455 c = GetNexusTypechar (cp + 7, "GAP");
02456 }
02457 sequence_info->beginning_gap = s_ReplaceNexusTypeChar (sequence_info->beginning_gap, c);
02458 sequence_info->middle_gap = s_ReplaceNexusTypeChar (sequence_info->middle_gap, c);
02459 sequence_info->end_gap = s_ReplaceNexusTypeChar (sequence_info->end_gap, c);
02460
02461 c = GetNexusTypechar (cp + 7, "match");
02462 if (c == 0) {
02463 c = GetNexusTypechar (cp + 7, "MATCH");
02464 }
02465 sequence_info->match = s_ReplaceNexusTypeChar (sequence_info->match, c);
02466
02467 return eTrue;
02468 }
02469
02470
02471
02472
02473
02474
02475
02476 static EBool s_IsConsensusLine (char * str)
02477 {
02478 if (str == NULL
02479 || strspn (str, "*:. \t\r\n") < strlen (str)
02480 || (strchr (str, '*') == NULL
02481 && strchr (str, ':') == NULL
02482 && strchr (str, '.') == NULL)) {
02483 return eFalse;
02484 } else {
02485 return eTrue;
02486 }
02487 }
02488
02489
02490
02491
02492
02493
02494 static EBool s_SkippableNexusComment (char *str)
02495 {
02496 char * last_semicolon;
02497
02498 if (str == NULL) {
02499 return eFalse;
02500 }
02501 last_semicolon = strrchr (str, ';');
02502 if (last_semicolon == NULL
02503 || strspn (last_semicolon + 1, " \t\r") != strlen (last_semicolon + 1)
02504 || strchr (str, ';') != last_semicolon) {
02505 return eFalse;
02506 }
02507 if (s_StringNICmp (str, "format ", 7) == 0
02508 || s_StringNICmp (str, "dimensions ", 11) == 0
02509 || s_StringNICmp (str, "dimensions ", 11) == 0
02510 || s_StringNICmp (str, "options ", 8) == 0
02511 || s_StringNICmp (str, "begin characters", 16) == 0
02512 || s_StringNICmp (str, "begin data", 10) == 0) {
02513 return eTrue;
02514 } else {
02515 return eFalse;
02516 }
02517 }
02518
02519
02520
02521
02522
02523
02524 static EBool s_SkippableString (char * str)
02525 {
02526 if (str == NULL
02527 || s_StringNICmp (str, "matrix", 6) == 0
02528 || s_StringNICmp (str, "#NEXUS", 6) == 0
02529 || s_StringNICmp (str, "CLUSTAL W", 8) == 0
02530 || s_SkippableNexusComment (str)
02531 || s_IsTwoNumbersSeparatedBySpace (str)
02532 || s_IsConsensusLine (str)
02533 || str [0] == ';') {
02534 return eTrue;
02535 } else {
02536 return eFalse;
02537 }
02538 }
02539
02540
02541
02542
02543
02544 static EBool s_IsAlnFormatString (char * str)
02545 {
02546 if (s_StringNICmp (str, "matrix", 6) == 0
02547 || s_StringNICmp (str, "#NEXUS", 6) == 0
02548 || s_StringNICmp (str, "CLUSTAL W", 8) == 0
02549 || s_SkippableNexusComment (str)
02550 || s_IsTwoNumbersSeparatedBySpace (str)
02551 || s_IsConsensusLine (str)) {
02552 return eTrue;
02553 } else {
02554 return eFalse;
02555 }
02556 }
02557
02558
02559
02560
02561 static EBool s_IsBlank (char * str)
02562 {
02563 size_t len;
02564
02565 if (str == NULL) {
02566 return eTrue;
02567 }
02568 len = strspn (str, " \t\r");
02569 if (len == strlen (str)) {
02570 return eTrue;
02571 }
02572 return eFalse;
02573 }
02574
02575
02576
02577
02578
02579
02580 static EBool s_FoundStopLine (char * linestring)
02581 {
02582 if (linestring == NULL) {
02583 return eFalse;
02584 }
02585 if (s_StringNICmp (linestring, "endblock", 8) == 0
02586 || s_StringNICmp (linestring, "end;", 4) == 0) {
02587 return eTrue;
02588 }
02589 return eFalse;
02590 }
02591
02592
02593
02594
02595
02596 static EBool s_IsASN1 (char * linestring)
02597 {
02598 if (linestring != NULL && strstr (linestring, "::=") != NULL) {
02599 return eTrue;
02600 } else {
02601 return eFalse;
02602 }
02603 }
02604
02605
02606
02607
02608
02609
02610
02611 static void s_CommentLocFree (TCommentLocPtr clp)
02612 {
02613 if (clp == NULL) {
02614 return;
02615 }
02616 s_CommentLocFree (clp->next);
02617 free (clp);
02618 }
02619
02620
02621
02622
02623
02624
02625
02626
02627 static TCommentLocPtr s_FindComment (char * string)
02628 {
02629 char * cp_start;
02630 char * cp_end;
02631 TCommentLocPtr clp;
02632
02633 if (string == NULL) {
02634 return NULL;
02635 }
02636 cp_start = strstr (string, "[");
02637 if (cp_start != NULL) {
02638 cp_end = strstr (cp_start, "]");
02639 if (cp_end != NULL) {
02640 clp = (TCommentLocPtr) malloc (sizeof (SCommentLoc));
02641 if (clp == NULL) {
02642 return NULL;
02643 }
02644 clp->start = cp_start;
02645 clp->end = cp_end;
02646 clp->next = NULL;
02647 return clp;
02648 }
02649 }
02650 return NULL;
02651 }
02652
02653
02654
02655 static void s_RemoveCommentFromLine (char * linestring)
02656 {
02657 TCommentLocPtr clp;
02658
02659 if (linestring == NULL) {
02660 return;
02661 }
02662
02663 clp = s_FindComment (linestring);
02664 while (clp != NULL) {
02665 strcpy (clp->start, clp->end + 1);
02666 s_CommentLocFree (clp);
02667 clp = s_FindComment (linestring);
02668 }
02669
02670
02671
02672
02673
02674 if ( linestring [0] == '>' && linestring [1] == 0) {
02675 linestring [0] = 0;
02676 }
02677
02678
02679 if (strspn (linestring, " \t\r") == strlen (linestring)) {
02680 linestring [0] = 0;
02681 }
02682
02683 }
02684
02685
02686
02687
02688
02689 static EBool s_IsOrganismComment (TCommentLocPtr clp)
02690 {
02691 int len;
02692 char * cp;
02693 char * cp_end;
02694
02695 if (clp == NULL || clp->start == NULL || clp->end == NULL) {
02696 return eFalse;
02697 }
02698
02699 cp = clp->start;
02700 if (*cp != '[') {
02701 return eFalse;
02702 }
02703 cp ++;
02704 len = strspn ( clp->start, " \t\r");
02705 cp = cp + len;
02706 cp_end = strstr (cp, "=");
02707 if (cp_end == NULL) {
02708 return eFalse;
02709 }
02710 cp_end --;
02711 while (cp_end > cp && isspace ((unsigned char)*cp_end)) {
02712 cp_end --;
02713 }
02714 cp_end ++;
02715 if ((cp_end - cp == 3 && s_StringNICmp (cp, "org", 3) == 0)
02716 || (cp_end - cp == 8 && s_StringNICmp (cp, "organism", 8) == 0)) {
02717 return eTrue;
02718 }
02719 return eFalse;
02720 }
02721
02722
02723
02724
02725
02726
02727
02728
02729 static TCommentLocPtr s_FindOrganismComment (char * string)
02730 {
02731 TCommentLocPtr clp, next_clp;
02732
02733 if (string == NULL) {
02734 return NULL;
02735 }
02736
02737 clp = s_FindComment (string);
02738 while (clp != NULL && ! s_IsOrganismComment (clp)) {
02739 clp = s_FindComment (clp->end);
02740 }
02741
02742 if (clp == NULL) {
02743 return NULL;
02744 }
02745
02746 next_clp = s_FindComment (clp->end);
02747 while (next_clp != NULL &&
02748 (int) strspn (clp->end + 1, " \t\r") == next_clp->start - clp->end - 1
02749 && ! s_IsOrganismComment (next_clp))
02750 {
02751 clp->end = next_clp->end;
02752 next_clp = s_FindComment (clp->end);
02753 }
02754 return clp;
02755 }
02756
02757
02758
02759 static void s_RemoveOrganismCommentFromLine (char * string)
02760 {
02761 TCommentLocPtr clp;
02762
02763 while ((clp = s_FindOrganismComment (string)) != NULL) {
02764 strcpy (clp->start, clp->end + 1);
02765 s_CommentLocFree (clp);
02766 }
02767 }
02768
02769
02770
02771
02772
02773
02774
02775 static TCommentLocPtr s_CreateOrderedOrgCommentList (TCommentLocPtr org_clp)
02776 {
02777 TCommentLocPtr clp, prev_clp, next_clp, clp_list, ordered_start;
02778 int next_len, this_len, len;
02779
02780 if (org_clp == NULL) {
02781 return NULL;
02782 }
02783
02784 clp_list = s_FindComment (org_clp->start);
02785 prev_clp = NULL;
02786 ordered_start = s_FindComment (clp_list->end);
02787 if (s_IsOrganismComment (ordered_start))
02788 {
02789 s_CommentLocFree (ordered_start);
02790 ordered_start = NULL;
02791 }
02792 if (ordered_start == NULL) {
02793 return clp_list;
02794 }
02795 clp = s_FindComment (ordered_start->end);
02796 while (clp != NULL && clp->start < org_clp->end) {
02797
02798 prev_clp = NULL;
02799 next_clp = ordered_start;
02800 next_len = next_clp->end - next_clp->start;
02801 this_len = clp->end - clp->start;
02802 len = next_len > this_len ? next_len : this_len;
02803 while (next_clp != NULL
02804 && strncmp (next_clp->start, clp->start, len) < 0)
02805 {
02806 prev_clp = next_clp;
02807 next_clp = next_clp->next;
02808 if (next_clp != NULL) {
02809 next_len = next_clp->end - next_clp->start;
02810 len = next_len > this_len ? next_len : this_len;
02811 }
02812 }
02813 if (prev_clp == NULL) {
02814 clp->next = ordered_start;
02815 ordered_start = clp;
02816 } else {
02817 clp->next = prev_clp->next;
02818 prev_clp->next = clp;
02819 }
02820 clp = s_FindComment (clp->end);
02821 }
02822 clp_list->next = ordered_start;
02823 return clp_list;
02824 }
02825
02826
02827
02828
02829
02830 static char * s_CreateOrderedOrgName (TCommentLocPtr org_clp)
02831 {
02832 TCommentLocPtr clp, clp_list;
02833 char * ordered_org_name;
02834 char * cp;
02835
02836 if (org_clp == NULL) {
02837 return NULL;
02838 }
02839
02840 ordered_org_name = (char *)malloc (org_clp->end - org_clp->start + 2);
02841 if (ordered_org_name == NULL) {
02842 return NULL;
02843 }
02844 ordered_org_name [0] = 0;
02845 clp_list = s_CreateOrderedOrgCommentList (org_clp);
02846 cp = ordered_org_name;
02847 for (clp = clp_list; clp != NULL; clp = clp->next) {
02848 strncpy (cp, clp->start, clp->end - clp->start + 1);
02849 cp += clp->end - clp->start + 1;
02850 *cp = 0;
02851 }
02852
02853 s_CommentLocFree (clp_list);
02854
02855 return ordered_org_name;
02856 }
02857
02858 static void s_AddDeflineFromOrganismLine
02859 (char *defline,
02860 int line_num,
02861 int defline_offset,
02862 SAlignRawFilePtr afrp)
02863 {
02864 TLineInfoPtr lip;
02865 int org_num, defline_num, new_len;
02866 char *empty_defline, *new_defline;
02867
02868 if (afrp == NULL || defline == NULL) {
02869 return;
02870 }
02871
02872
02873
02874 lip = afrp->organisms;
02875 org_num = 0;
02876 while (lip != NULL)
02877 {
02878 org_num++;
02879 lip = lip->next;
02880 }
02881
02882 lip = afrp->deflines;
02883 defline_num = 0;
02884 while (lip != NULL && defline_num < org_num) {
02885 lip = lip->next;
02886 defline_num ++;
02887 }
02888
02889 if (defline_num == org_num) {
02890
02891 if (strlen (lip->data) == 0)
02892 {
02893 free (lip->data);
02894 lip->data = defline;
02895 }
02896 else
02897 {
02898
02899 new_len = strlen (lip->data) + strlen (defline) + 2;
02900 new_defline = (char *) malloc (new_len * sizeof (char));
02901 if (new_defline != NULL)
02902 {
02903 strcpy (new_defline, lip->data);
02904 strcat (new_defline, " ");
02905 strcat (new_defline, defline);
02906 free (lip->data);
02907 lip->data = new_defline;
02908 free (defline);
02909 defline = NULL;
02910 }
02911 }
02912
02913 lip->line_num = line_num;
02914 lip->line_offset = defline_offset;
02915 lip->delete_me = eFalse;
02916 }
02917 else
02918 {
02919
02920 while (defline_num < org_num - 1)
02921 {
02922 empty_defline = (char *) malloc (sizeof (char));
02923 if (empty_defline != NULL)
02924 {
02925 *empty_defline = 0;
02926 afrp->deflines = s_AddLineInfo (afrp->deflines,
02927 empty_defline, 0,
02928 0);
02929 afrp->num_deflines ++;
02930 }
02931 defline_num++;
02932 }
02933
02934 afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
02935 line_num, defline_offset);
02936 afrp->num_deflines ++;
02937 }
02938 }
02939
02940
02941
02942
02943 static void s_ReadOrgNamesFromText
02944 (char * string,
02945 int line_num,
02946 SAlignRawFilePtr afrp)
02947 {
02948 TCommentLocPtr clp;
02949 char * org_name;
02950 char * cp;
02951 char * defline;
02952 char * comment_end;
02953 int defline_offset;
02954
02955 if (string == NULL || afrp == NULL) {
02956 return;
02957 }
02958
02959 clp = s_FindOrganismComment (string);
02960 if (clp == NULL && (strstr (string, "org=") != NULL || strstr (string, "organism=") != NULL))
02961 {
02962 s_ReportOrgCommentError (string, afrp->report_error, afrp->report_error_userdata);
02963 }
02964 while (clp != NULL) {
02965 org_name = s_CreateOrderedOrgName (clp);
02966 afrp->organisms = s_AddLineInfo (afrp->organisms, org_name, line_num,
02967 clp->start - string);
02968 free (org_name);
02969 afrp->num_organisms ++;
02970 defline = NULL;
02971 defline_offset = 0;
02972 if (*clp->end != 0) {
02973 cp = clp->end + 1;
02974 cp += strspn (cp, " \t\r\n");
02975 if (*cp != 0) {
02976 defline = clp->end + 1;
02977 defline_offset = clp->end - string + 1;
02978 }
02979 }
02980 s_AddDeflineFromOrganismLine (defline, line_num, defline_offset, afrp);
02981
02982 comment_end = clp->end;
02983 s_CommentLocFree (clp);
02984 clp = s_FindOrganismComment (comment_end);
02985 }
02986 }
02987
02988
02989
02990
02991
02992
02993
02994
02995
02996
02997
02998
02999 static TAlignRawSeqPtr s_AlignRawSeqNew (TAlignRawSeqPtr list)
03000 {
03001 TAlignRawSeqPtr arsp, last;
03002
03003 arsp = (TAlignRawSeqPtr)malloc (sizeof (SAlignRawSeq));
03004 if (arsp == NULL) {
03005 return NULL;
03006 }
03007 arsp->id = NULL;
03008 arsp->sequence_data = NULL;
03009 arsp->id_lines = NULL;
03010 arsp->next = NULL;
03011
03012 last = list;
03013 while (last != NULL && last->next != NULL) {
03014 last = last->next;
03015 }
03016 if (last != NULL) {
03017 last->next = arsp;
03018 }
03019 return arsp;
03020 }
03021
03022
03023
03024
03025
03026 static void s_AlignRawSeqFree (TAlignRawSeqPtr arsp)
03027 {
03028 if (arsp == NULL) {
03029 return;
03030 }
03031 s_AlignRawSeqFree (arsp->next);
03032 free (arsp->id);
03033 s_LineInfoFree (arsp->sequence_data);
03034 s_IntLinkFree (arsp->id_lines);
03035 free (arsp);
03036 }
03037
03038
03039
03040
03041
03042
03043 static TAlignRawSeqPtr
03044 s_FindAlignRawSeqById
03045 (TAlignRawSeqPtr list,
03046 char * id)
03047 {
03048 TAlignRawSeqPtr arsp;
03049
03050 for (arsp = list; arsp != NULL; arsp = arsp->next) {
03051 if (strcmp (arsp->id, id) == 0) {
03052 return arsp;
03053 }
03054 }
03055 return NULL;
03056 }
03057
03058
03059
03060
03061
03062
03063 static int
03064 s_FindAlignRawSeqOffsetById
03065 (TAlignRawSeqPtr list,
03066 char * id)
03067 {
03068 TAlignRawSeqPtr arsp;
03069 int offset;
03070
03071 for (arsp = list, offset = 0; arsp != NULL; arsp = arsp->next, offset++) {
03072 if (strcmp (arsp->id, id) == 0) {
03073 return offset;
03074 }
03075 }
03076 return -1;
03077 }
03078
03079
03080
03081
03082
03083
03084 static char *
03085 s_GetAlignRawSeqIDByOffset
03086 (TAlignRawSeqPtr list,
03087 int offset)
03088 {
03089 TAlignRawSeqPtr arsp;
03090 int index;
03091
03092 arsp = list;
03093 index = 0;
03094 while ( arsp != NULL && index != offset ) {
03095 arsp = arsp->next;
03096 index++;
03097 }
03098 if (index == offset && arsp != NULL) {
03099 return arsp->id;
03100 } else {
03101 return NULL;
03102 }
03103 }
03104
03105
03106
03107
03108
03109
03110
03111 static TAlignRawSeqPtr
03112 s_AddAlignRawSeqById
03113 (TAlignRawSeqPtr list,
03114 char * id,
03115 char * data,
03116 int id_line_num,
03117 int data_line_num,
03118 int data_line_offset)
03119 {
03120 TAlignRawSeqPtr arsp;
03121 TIntLinkPtr ilp;
03122
03123 arsp = s_FindAlignRawSeqById (list, id);
03124 if (arsp == NULL) {
03125 arsp = s_AlignRawSeqNew (list);
03126 if (arsp == NULL) {
03127 return NULL;
03128 }
03129 if (list == NULL) list = arsp;
03130 arsp->id = strdup (id);
03131 }
03132 arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
03133 data,
03134 data_line_num,
03135 data_line_offset);
03136 ilp = s_IntLinkNew (id_line_num, arsp->id_lines);
03137 if (arsp->id_lines == NULL) arsp->id_lines = ilp;
03138 return list;
03139 }
03140
03141
03142
03143
03144
03145
03146 static EBool
03147 s_AddAlignRawSeqByIndex
03148 (TAlignRawSeqPtr list,
03149 int index,
03150 char * data,
03151 int data_line_num,
03152 int data_line_offset)
03153 {
03154 TAlignRawSeqPtr arsp;
03155 int curr;
03156
03157 curr = 0;
03158 for (arsp = list; arsp != NULL && curr < index; arsp = arsp->next) {
03159 curr++;
03160 }
03161 if (arsp == NULL) {
03162 return eFalse;
03163 } else {
03164 arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
03165 data,
03166 data_line_num,
03167 data_line_offset);
03168 return eTrue;
03169 }
03170 }
03171
03172
03173
03174
03175 static void s_AlignFileRawFree (SAlignRawFilePtr afrp)
03176 {
03177 if (afrp == NULL) {
03178 return;
03179 }
03180
03181 s_LineInfoFree (afrp->organisms);
03182 s_LineInfoFree (afrp->deflines);
03183 s_LineInfoFree (afrp->line_list);
03184 s_AlignRawSeqFree (afrp->sequences);
03185 s_IntLinkFree (afrp->offset_list);
03186 free (afrp->alphabet);
03187 free (afrp);
03188 }
03189
03190
03191
03192
03193
03194
03195 static SAlignRawFilePtr s_AlignFileRawNew (void)
03196 {
03197 SAlignRawFilePtr afrp;
03198
03199 afrp = (SAlignRawFilePtr)malloc (sizeof (SAlignRawFileData));
03200 if (afrp == NULL) {
03201 return NULL;
03202 }
03203 afrp->marked_ids = eFalse;
03204 afrp->line_list = NULL;
03205 afrp->organisms = NULL;
03206 afrp->num_organisms = 0;
03207 afrp->deflines = NULL;
03208 afrp->num_deflines = 0;
03209 afrp->block_size = 0;
03210 afrp->offset_list = NULL;
03211 afrp->sequences = NULL;
03212 afrp->report_error = NULL;
03213 afrp->report_error_userdata = NULL;
03214 afrp->alphabet = NULL;
03215 afrp->expected_num_sequence = 0;
03216 afrp->expected_sequence_len = 0;
03217 afrp->num_segments = 1;
03218 afrp->align_format_found = eFalse;
03219 return afrp;
03220 }
03221
03222
03223
03224
03225
03226
03227
03228
03229
03230
03231
03232
03233
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243 static SLengthListPtr s_GetBlockPattern (char * cp)
03244 {
03245 SLengthListPtr this_pattern;
03246 int len;
03247
03248 this_pattern = s_LengthListNew (NULL);
03249 if (this_pattern == NULL) {
03250 return NULL;
03251 }
03252
03253 this_pattern->num_appearances = 1;
03254 while (*cp != 0) {
03255 len = strcspn (cp, " \t\r");
03256 s_AddLengthRepeat (this_pattern, len);
03257 cp += len;
03258 cp += strspn (cp, " \t\r");
03259 }
03260 return this_pattern;
03261 }
03262
03263
03264
03265
03266
03267
03268 static int
03269 s_ForecastBlockPattern
03270 (SLengthListPtr pattern_list,
03271 TIntLinkPtr next_offset,
03272 int line_start,
03273 int block_size)
03274 {
03275 int line_counter;
03276 SLengthListPtr llp;
03277
03278 line_counter = line_start;
03279 if (next_offset != NULL
03280 && next_offset->ival - line_counter < block_size) {
03281 return -1;
03282 }
03283
03284 for (llp = pattern_list;
03285 llp != NULL
03286 && (next_offset == NULL || line_counter < next_offset->ival - 1)
03287 && line_counter - line_start < block_size;
03288 llp = llp->next)
03289 {
03290 if (llp->lengthrepeats == NULL) {
03291 return -1;
03292 }
03293 line_counter += llp->num_appearances;
03294 }
03295 if (line_counter - line_start == block_size) {
03296
03297
03298
03299
03300 if (llp == NULL
03301 || llp->lengthrepeats == NULL) {
03302 return line_start;
03303 }
03304 }
03305 return -1;
03306 }
03307
03308
03309
03310
03311
03312
03313 static TIntLinkPtr
03314 s_AugmentBlockPatternOffsetList
03315 (SLengthListPtr pattern_list,
03316 TIntLinkPtr offset_list,
03317 int block_size)
03318 {
03319 int line_counter;
03320 SLengthListPtr llp;
03321 TIntLinkPtr next_offset, prev_offset, new_offset;
03322 int forecast_pos;
03323
03324 prev_offset = NULL;
03325 next_offset = offset_list;
03326 line_counter = 0;
03327 llp = pattern_list;
03328 while (llp != NULL) {
03329 if (next_offset != NULL && line_counter == next_offset->ival) {
03330 prev_offset = next_offset;
03331 next_offset = next_offset->next;
03332
03333 while (line_counter - prev_offset->ival < block_size
03334 && llp != NULL)
03335 {
03336 line_counter += llp->num_appearances;
03337 llp = llp->next;
03338 }
03339 } else {
03340 forecast_pos = s_ForecastBlockPattern (llp, next_offset,
03341 line_counter,
03342 block_size);
03343 if (forecast_pos > 0) {
03344 new_offset = s_IntLinkNew (forecast_pos, NULL);
03345 if (new_offset == NULL) {
03346 return NULL;
03347 }
03348 if (prev_offset == NULL) {
03349 new_offset->next = offset_list;
03350 offset_list = new_offset;
03351 } else {
03352 new_offset->next = next_offset;
03353 prev_offset->next = new_offset;
03354 }
03355 prev_offset = new_offset;
03356
03357 while (line_counter - prev_offset->ival < block_size
03358 && llp != NULL)
03359 {
03360 line_counter += llp->num_appearances;
03361 llp = llp->next;
03362 }
03363 } else {
03364 line_counter += llp->num_appearances;
03365 llp = llp->next;
03366 }
03367 }
03368 }
03369 return offset_list;
03370 }
03371
03372
03373
03374
03375
03376
03377 static EBool
03378 s_FindUnusedLines
03379 (SLengthListPtr pattern_list,
03380 SAlignRawFilePtr afrp)
03381 {
03382 TIntLinkPtr offset;
03383 SLengthListPtr llp;
03384 int line_counter;
03385 int block_line_counter;
03386 EBool rval = eFalse;
03387 TLineInfoPtr line_val;
03388 int skip;
03389
03390 if (pattern_list == NULL || afrp == NULL
03391 || afrp->offset_list == NULL || afrp->block_size < 2) {
03392 return eFalse;
03393 }
03394
03395 offset = afrp->offset_list;
03396 llp = pattern_list;
03397 line_counter = 0;
03398 line_val = afrp->line_list;
03399
03400 while (llp != NULL && line_val != NULL) {
03401 while (llp != NULL && line_val != NULL
03402 && (offset == NULL || line_counter < offset->ival)) {
03403 if (llp->lengthrepeats != NULL) {
03404 s_ReportUnusedLine (line_counter,
03405 line_counter + llp->num_appearances - 1,
03406 line_val,
03407 afrp->report_error,
03408 afrp->report_error_userdata);
03409 if (offset != afrp->offset_list) {
03410 rval = eTrue;
03411 }
03412 }
03413 line_counter += llp->num_appearances;
03414 for (skip = 0;
03415 skip < llp->num_appearances && line_val != NULL;
03416 skip++) {
03417 line_val = line_val->next;
03418 }
03419 llp = llp->next;
03420 }
03421 block_line_counter = 0;
03422 while (block_line_counter < afrp->block_size && llp != NULL) {
03423 block_line_counter += llp->num_appearances;
03424 line_counter += llp->num_appearances;
03425 for (skip = 0;
03426 skip < llp->num_appearances && line_val != NULL;
03427 skip++) {
03428 line_val = line_val->next;
03429 }
03430 llp = llp->next;
03431 }
03432 if (offset != NULL) {
03433 offset = offset->next;
03434 }
03435 }
03436 return rval;
03437 }
03438
03439
03440
03441
03442
03443
03444 static void
03445 s_FindInterleavedBlocks
03446 (SLengthListPtr pattern_list,
03447 SAlignRawFilePtr afrp)
03448 {
03449 SLengthListPtr llp, llp_next;
03450 TSizeInfoPtr size_list, best_ptr;
03451 TIntLinkPtr new_offset;
03452 int line_counter;
03453
03454 afrp->block_size = 0;
03455 size_list = NULL;
03456 afrp->offset_list = NULL;
03457 for (llp = pattern_list; llp != NULL; llp = llp->next) {
03458 llp_next = llp->next;
03459 if (llp->num_appearances > 1
03460 && (llp_next == NULL || llp_next->lengthrepeats == NULL)) {
03461 size_list = s_AddSizeInfo (size_list, llp->num_appearances);
03462 }
03463 }
03464 best_ptr = s_GetMostPopularSizeInfo (size_list);
03465 if (best_ptr != NULL
03466 && (best_ptr->num_appearances > 1 ||
03467 (size_list->next == NULL && size_list->size_value > 1))) {
03468 afrp->block_size = best_ptr->size_value;
03469 line_counter = 0;
03470 for (llp = pattern_list; llp != NULL; llp = llp->next) {
03471 llp_next = llp->next;
03472 if (llp->num_appearances == afrp->block_size
03473 && (llp_next == NULL || llp_next->lengthrepeats == NULL))
03474 {
03475 new_offset = s_IntLinkNew (line_counter, afrp->offset_list);
03476 if (new_offset == NULL) {
03477 return;
03478 }
03479 if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
03480 }
03481 line_counter += llp->num_appearances;
03482 }
03483 afrp->offset_list = s_AugmentBlockPatternOffsetList (pattern_list,
03484 afrp->offset_list,
03485 afrp->block_size);
03486 }
03487 if (s_FindUnusedLines (pattern_list, afrp)) {
03488 s_IntLinkFree (afrp->offset_list);
03489 afrp->offset_list = NULL;
03490 afrp->block_size = 0;
03491 } else {
03492 afrp->align_format_found = eTrue;
03493 }
03494 s_SizeInfoFree (size_list);
03495
03496 }
03497
03498 static void s_TrimEndSpace (char *linestring)
03499 {
03500 int len;
03501 char *cp;
03502
03503 if (linestring == NULL) return;
03504 len = strlen (linestring);
03505 cp = linestring + len - 1;
03506 while (cp > linestring && (*cp == ' ' || *cp == '\t' || *cp == '\r' || *cp == '\n'))
03507 {
03508 *cp = 0;
03509 cp--;
03510 }
03511 }
03512
03513 static SAlignRawFilePtr
03514 s_ReadAlignFileRaw
03515 (FReadLineFunction readfunc,
03516 void * userdata,
03517 TSequenceInfoPtr sequence_info,
03518 EBool use_nexus_file_info,
03519 FReportErrorFunction errfunc,
03520 void * errdata)
03521 {
03522 char * linestring;
03523 SAlignRawFilePtr afrp;
03524 char * tmp;
03525 EBool found_stop;
03526 int overall_line_count;
03527 EBool found_expected_ntax = eFalse;
03528 EBool found_expected_nchar = eFalse;
03529 EBool found_char_comment = eFalse;
03530 SLengthListPtr pattern_list = NULL;
03531 SLengthListPtr this_pattern, last_pattern = NULL;
03532 char * cp;
03533 int len;
03534 TIntLinkPtr new_offset;
03535 EBool in_taxa_comment;
03536 EBool in_bracketed_comment = eFalse;
03537 TBracketedCommentListPtr comment_list = NULL, last_comment = NULL;
03538 EBool last_line_was_marked_id = eFalse;
03539 TLineInfoPtr last_line = NULL, next_line;
03540
03541 if (readfunc == NULL || sequence_info == NULL) {
03542 return NULL;
03543 }
03544
03545 afrp = s_AlignFileRawNew ();
03546 if (afrp == NULL) {
03547 return NULL;
03548 }
03549
03550 afrp->alphabet = strdup (sequence_info->alphabet);
03551 afrp->report_error = errfunc;
03552 afrp->report_error_userdata = errdata;
03553
03554 overall_line_count = 0;
03555 found_stop = eFalse;
03556 in_taxa_comment = eFalse;
03557 linestring = readfunc (userdata);
03558 if (s_IsASN1 (linestring)) {
03559 s_ReportASN1Error (afrp->report_error, afrp->report_error_userdata);
03560 s_AlignFileRawFree (afrp);
03561 return NULL;
03562 }
03563
03564 while (linestring != NULL && linestring [0] != EOF) {
03565 s_TrimEndSpace (linestring);
03566 s_ReadOrgNamesFromText (linestring, overall_line_count, afrp);
03567
03568
03569
03570
03571
03572 len = strspn (linestring, " \t\r\n");
03573 tmp = strdup (linestring + len);
03574 if (tmp == NULL) {
03575 return NULL;
03576 }
03577
03578 if (! found_stop && ! in_taxa_comment) {
03579 found_stop = s_FoundStopLine (tmp);
03580 }
03581 if (! found_stop) {
03582 if (! found_expected_ntax || ! found_expected_nchar) {
03583 if (s_IsTwoNumbersSeparatedBySpace (tmp)) {
03584 s_GetFASTAExpectedNumbers (tmp, afrp);
03585 found_expected_ntax = eTrue;
03586 found_expected_nchar = eTrue;
03587 afrp->align_format_found = eTrue;
03588 } else {
03589 s_GetNexusSizeComments (tmp, &found_expected_ntax,
03590 &found_expected_nchar, afrp);
03591 }
03592 }
03593 if (! found_char_comment) {
03594 if (use_nexus_file_info) {
03595 found_char_comment = s_UpdateNexusCharInfo (tmp, sequence_info);
03596 } else {
03597 found_char_comment = s_CheckNexusCharInfo (tmp, sequence_info,
03598 afrp->report_error,
03599 afrp->report_error_userdata);
03600 }
03601 }
03602 if (in_taxa_comment) {
03603 if (strncmp (tmp, "end;", 4) == 0) {
03604 in_taxa_comment = eFalse;
03605 }
03606 tmp [0] = 0;
03607 } else if (strncmp (tmp, "begin taxa;", 11) == 0) {
03608 tmp [0] = 0;
03609 in_taxa_comment = eTrue;
03610 afrp->align_format_found = eTrue;
03611 }
03612
03613
03614
03615 s_RemoveCommentFromLine (tmp);
03616
03617 if (in_bracketed_comment) {
03618 len = strspn (linestring, " \t\r\n");
03619 if (last_comment != NULL)
03620 {
03621 s_BracketedCommentListAddLine (last_comment, linestring + len,
03622 overall_line_count, len);
03623 }
03624 if (strchr (tmp, ']') != NULL) {
03625 in_bracketed_comment = eFalse;
03626 }
03627 tmp [0] = 0;
03628 } else if (tmp [0] == '[' && strchr (tmp, ']') == NULL) {
03629 in_bracketed_comment = eTrue;
03630 len = strspn (linestring, " \t\r\n");
03631 last_comment = s_BracketedCommentListNew (comment_list,
03632 linestring + len,
03633 overall_line_count, len);
03634 if (comment_list == NULL)
03635 {
03636 comment_list = last_comment;
03637 }
03638 tmp [0] = 0;
03639 }
03640
03641 if (!afrp->align_format_found && s_IsAlnFormatString (tmp)) {
03642 afrp->align_format_found = eTrue;
03643 }
03644
03645 if (s_SkippableString (tmp)) {
03646 tmp [0] = 0;
03647 }
03648 if (tmp [0] == '>' && ! found_stop) {
03649
03650
03651
03652
03653 if (last_line_was_marked_id)
03654 {
03655 afrp->marked_ids = eFalse;
03656 }
03657 else
03658 {
03659 afrp->marked_ids = eTrue;
03660 }
03661 new_offset = s_IntLinkNew (overall_line_count + 1,
03662 afrp->offset_list);
03663 if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
03664 last_line_was_marked_id = eTrue;
03665 } else {
03666 last_line_was_marked_id = eFalse;
03667
03668 len = strcspn (tmp, " \t\r");
03669 if (len > 0) {
03670 cp = tmp + len;
03671 len = strspn (cp, " \t\r");
03672 if (len > 0) {
03673 cp = cp + len;
03674 }
03675 if (*cp == 0) {
03676 this_pattern = s_GetBlockPattern (tmp);
03677 } else {
03678 this_pattern = s_GetBlockPattern (cp);
03679 }
03680 } else {
03681 this_pattern = s_GetBlockPattern (tmp);
03682 }
03683
03684 if (last_pattern == NULL) {
03685 pattern_list = this_pattern;
03686 last_pattern = this_pattern;
03687 } else if (s_DoLengthPatternsMatch (last_pattern, this_pattern)) {
03688 last_pattern->num_appearances ++;
03689 s_LengthListFree (this_pattern);
03690 } else {
03691 last_pattern->next = this_pattern;
03692 last_pattern = this_pattern;
03693 }
03694 }
03695
03696 len = strspn (linestring, " \t\r\n");
03697
03698 next_line = s_LineInfoNew (linestring + len, overall_line_count, len);
03699
03700 if (last_line == NULL) {
03701 afrp->line_list = next_line;
03702 } else {
03703 last_line->next = next_line;
03704 }
03705 last_line = next_line;
03706 }
03707 free (linestring);
03708 free (tmp);
03709 linestring = readfunc (userdata);
03710 overall_line_count ++;
03711 }
03712 afrp->num_segments = s_GetNumSegmentsInAlignment (comment_list, errfunc, errdata);
03713 if (afrp->num_segments > 1)
03714 {
03715 if (afrp->offset_list != NULL)
03716 {
03717 s_ReportSegmentedAlignmentError (afrp->offset_list,
03718 errfunc, errdata);
03719 s_AlignFileRawFree (afrp);
03720 s_LengthListFree (pattern_list);
03721 s_BracketedCommentListFree (comment_list);
03722 return NULL;
03723 }
03724 else
03725 {
03726 afrp->offset_list = GetSegmentOffsetList (comment_list);
03727 afrp->marked_ids = eTrue;
03728 }
03729 }
03730 if (! afrp->marked_ids) {
03731 s_FindInterleavedBlocks (pattern_list, afrp);
03732 }
03733 s_LengthListFree (pattern_list);
03734 s_BracketedCommentListFree (comment_list);
03735 return afrp;
03736 }
03737
03738
03739
03740
03741
03742
03743
03744
03745 static EBool
03746 s_DoesBlockHaveIds
03747 (SAlignRawFilePtr afrp,
03748 TLineInfoPtr first_line,
03749 int num_lines_in_block)
03750 {
03751 TLineInfoPtr lip;
03752 char * linestring;
03753 char * this_id;
03754 TAlignRawSeqPtr arsp;
03755 size_t len;
03756 int block_offset;
03757
03758 if (afrp->sequences == NULL) {
03759 return eTrue;
03760 }
03761
03762 for (lip = first_line, block_offset = 0;
03763 lip != NULL && block_offset < num_lines_in_block;
03764 lip = lip->next, block_offset++)
03765 {
03766 linestring = lip->data;
03767 if (linestring != NULL) {
03768 len = strcspn (linestring, " \t\r");
03769 if (len > 0 && len < strlen (linestring)) {
03770 this_id = (char *) malloc (len + 1);
03771 if (this_id == NULL) {
03772 return eFalse;
03773 }
03774 strncpy (this_id, linestring, len);
03775 this_id [len] = 0;
03776 arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
03777 free (this_id);
03778 if (arsp != NULL) {
03779 return eTrue;
03780 }
03781 }
03782 }
03783 }
03784 return eFalse;
03785 }
03786
03787
03788
03789
03790
03791
03792
03793 static EBool
03794 s_BlockIsConsistent
03795 (SAlignRawFilePtr afrp,
03796 TLineInfoPtr first_line,
03797 int num_lines_in_block,
03798 EBool has_ids,
03799 EBool first_block)
03800 {
03801 TLineInfoPtr lip;
03802 SLengthListPtr list, this_pattern, best;
03803 int len, block_offset, id_offset;
03804 char * tmp_id;
03805 EBool rval;
03806 char * cp;
03807
03808 rval = eTrue;
03809 list = NULL;
03810 for (lip = first_line, block_offset = 0;
03811 lip != NULL && block_offset < num_lines_in_block;
03812 lip = lip->next, block_offset ++)
03813 {
03814 cp = lip->data;
03815 if (has_ids) {
03816 len = strcspn (cp, " \t\r");
03817 if (first_block && len == strlen (cp)) {
03818
03819
03820
03821
03822 len = 10;
03823 }
03824 tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
03825 if (tmp_id == NULL) {
03826 return eFalse;
03827 }
03828 strncpy (tmp_id, cp, len);
03829 tmp_id [len] = 0;
03830 id_offset = s_FindAlignRawSeqOffsetById (afrp->sequences, tmp_id);
03831 if (id_offset != block_offset && ! first_block) {
03832 rval = eFalse;
03833 s_ReportInconsistentID (tmp_id, lip->line_num,
03834 afrp->report_error,
03835 afrp->report_error_userdata);
03836 }
03837 free (tmp_id);
03838 cp += len;
03839 cp += strspn (cp, " \t\r");
03840 }
03841 this_pattern = s_GetBlockPattern (cp);
03842 list = s_AddLengthList (list, this_pattern);
03843 }
03844
03845
03846 best = NULL;
03847 for (this_pattern = list;
03848 this_pattern != NULL;
03849 this_pattern = this_pattern->next)
03850 {
03851 if (this_pattern->num_appearances == 0) continue;
03852 if (best == NULL
03853 || this_pattern->num_appearances > best->num_appearances)
03854 {
03855 best = this_pattern;
03856 }
03857 }
03858
03859
03860 for (lip = first_line, block_offset = 0;
03861 lip != NULL && block_offset < num_lines_in_block;
03862 lip = lip->next, block_offset ++)
03863 {
03864 cp = lip->data;
03865 if (has_ids) {
03866 len = strcspn (cp, " \t\r");
03867 if (first_block && len == strlen (cp)) {
03868
03869
03870
03871
03872 len = 10;
03873 }
03874 tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
03875 if (tmp_id == NULL) {
03876 return eFalse;
03877 }
03878 strncpy (tmp_id, cp, len);
03879 tmp_id [len] = 0;
03880 cp += len;
03881 cp += strspn (cp, " \t\r");
03882 } else {
03883 tmp_id = s_GetAlignRawSeqIDByOffset (afrp->sequences, block_offset);
03884 }
03885 this_pattern = s_GetBlockPattern (cp);
03886 if ( ! s_DoLengthPatternsMatch (this_pattern, best)) {
03887 rval = eFalse;
03888 s_ReportInconsistentBlockLine (tmp_id, lip->line_num,
03889 afrp->report_error,
03890 afrp->report_error_userdata);
03891 }
03892 s_LengthListFree (this_pattern);
03893 if (has_ids) {
03894 free (tmp_id);
03895 }
03896 }
03897 s_LengthListFree (list);
03898 return rval;
03899 }
03900
03901
03902
03903
03904
03905 static void
03906 s_ProcessBlockLines
03907 (SAlignRawFilePtr afrp,
03908 TLineInfoPtr lines,
03909 int num_lines_in_block,
03910 EBool first_block)
03911 {
03912 TLineInfoPtr lip;
03913 char * linestring;
03914 char * cp;
03915 char * this_id;
03916 int len;
03917 int line_number;
03918 EBool this_block_has_ids;
03919 int pos;
03920 TAlignRawSeqPtr arsp;
03921
03922 this_block_has_ids = s_DoesBlockHaveIds (afrp, lines, num_lines_in_block);
03923 s_BlockIsConsistent (afrp, lines, num_lines_in_block, this_block_has_ids,
03924 first_block);
03925 for (lip = lines, line_number = 0;
03926 lip != NULL && line_number < num_lines_in_block;
03927 lip = lip->next, line_number ++)
03928 {
03929 linestring = lip->data;
03930 if (linestring != NULL) {
03931 pos = 0;
03932 if (this_block_has_ids) {
03933 len = strcspn (linestring, " \t\r");
03934 if (first_block && len == strlen (linestring)) {
03935
03936
03937
03938
03939 len = 10;
03940 }
03941 this_id = (char *) malloc (len + 1);
03942 if (this_id == NULL) {
03943 return;
03944 }
03945 strncpy (this_id, linestring, len);
03946 this_id [len] = 0;
03947 cp = linestring + len;
03948 pos += len;
03949 len = strspn (cp, " \t\r");
03950 cp += len;
03951 pos += len;
03952
03953 if (first_block)
03954 {
03955 arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
03956 if (arsp != NULL)
03957 {
03958 s_ReportDuplicateIDError (this_id, lip->line_num,
03959 afrp->report_error,
03960 afrp->report_error_userdata);
03961 }
03962 }
03963 afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
03964 this_id, cp,
03965 lip->line_num,
03966 lip->line_num,
03967 lip->line_offset + cp - linestring);
03968 free (this_id);
03969 } else {
03970 if (! s_AddAlignRawSeqByIndex (afrp->sequences, line_number,
03971 linestring,
03972 lip->line_num, lip->line_offset))
03973 {
03974 s_ReportBlockLengthError ("", lip->line_num,
03975 afrp->block_size,
03976 line_number,
03977 afrp->report_error,
03978 afrp->report_error_userdata);
03979 }
03980 }
03981 }
03982 }
03983 }
03984
03985
03986
03987
03988
03989 static void
03990 s_RemoveCommentsFromBlock
03991 (TLineInfoPtr first_line,
03992 int num_lines_in_block)
03993 {
03994 TLineInfoPtr lip;
03995 int block_offset;
03996
03997 for (lip = first_line, block_offset = 0;
03998 lip != NULL && block_offset < num_lines_in_block;
03999 lip = lip->next)
04000 {
04001 s_RemoveCommentFromLine (lip->data);
04002 }
04003 }
04004
04005
04006
04007
04008
04009 static void s_ProcessAlignRawFileByBlockOffsets (SAlignRawFilePtr afrp)
04010 {
04011 int line_counter;
04012 TIntLinkPtr offset_ptr;
04013 TLineInfoPtr lip;
04014 EBool first_block = eTrue;
04015 EBool in_taxa_comment = eFalse;
04016
04017 if (afrp == NULL) {
04018 return;
04019 }
04020
04021 line_counter = 0;
04022 offset_ptr = afrp->offset_list;
04023 lip = afrp->line_list;
04024 while (lip != NULL && offset_ptr != NULL
04025 && (in_taxa_comment || ! s_FoundStopLine (lip->data))) {
04026 if (in_taxa_comment) {
04027 if (strncmp (lip->data, "end;", 4) == 0) {
04028 in_taxa_comment = eFalse;
04029 }
04030 } else if (lip->data != NULL
04031 && strncmp (lip->data, "begin taxa;", 11) == 0) {
04032 in_taxa_comment = eTrue;
04033 }
04034 if (line_counter == offset_ptr->ival) {
04035 s_RemoveCommentsFromBlock (lip, afrp->block_size);
04036 s_ProcessBlockLines (afrp, lip, afrp->block_size, first_block);
04037 first_block = eFalse;
04038 offset_ptr = offset_ptr->next;
04039 }
04040 lip = lip->next;
04041 line_counter ++;
04042 }
04043 }
04044
04045
04046
04047
04048 static void
04049 s_CreateSequencesBasedOnTokenPatterns
04050 (TLineInfoPtr token_list,
04051 TIntLinkPtr offset_list,
04052 SLengthListPtr * anchorpattern,
04053 SAlignRawFilePtr afrp)
04054 {
04055 TLineInfoPtr lip;
04056 int line_counter;
04057 TIntLinkPtr offset_ptr, next_offset_ptr;
04058 char * curr_id;
04059 TSizeInfoPtr sip;
04060 int pattern_line_counter;
04061 int curr_seg;
04062
04063 if (token_list == NULL || offset_list == NULL
04064 || anchorpattern == NULL
04065 || afrp == NULL)
04066 {
04067 return;
04068 }
04069 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04070 {
04071 if (anchorpattern [curr_seg] == NULL || anchorpattern [curr_seg]->lengthrepeats == NULL)
04072 {
04073 return;
04074 }
04075 }
04076
04077 line_counter = 0;
04078 lip = token_list;
04079 offset_ptr = offset_list;
04080 curr_seg = 0;
04081
04082 for (offset_ptr = offset_list;
04083 offset_ptr != NULL && lip != NULL;
04084 offset_ptr = offset_ptr->next)
04085 {
04086 next_offset_ptr = offset_ptr->next;
04087 while (line_counter < offset_ptr->ival - 1 && lip != NULL) {
04088 lip = lip->next;
04089 line_counter ++;
04090 }
04091 if (lip != NULL) {
04092 curr_id = lip->data;
04093 lip = lip->next;
04094 line_counter ++;
04095 for (sip = anchorpattern[curr_seg]->lengthrepeats;
04096 sip != NULL
04097 && lip != NULL
04098 && (next_offset_ptr == NULL
04099 || line_counter < next_offset_ptr->ival - 1);
04100 sip = sip->next)
04101 {
04102 for (pattern_line_counter = 0;
04103 pattern_line_counter < sip->num_appearances
04104 && lip != NULL
04105 && (next_offset_ptr == NULL
04106 || line_counter < next_offset_ptr->ival - 1);
04107 pattern_line_counter ++)
04108 {
04109 if (lip->data [0] != ']' && lip->data [0] != '[') {
04110 if ((int) strlen (lip->data) != sip->size_value) {
04111 s_ReportLineLengthError (curr_id, lip,
04112 sip->size_value,
04113 afrp->report_error,
04114 afrp->report_error_userdata);
04115 }
04116 afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
04117 curr_id,
04118 lip->data,
04119 lip->line_num,
04120 lip->line_num,
04121 lip->line_offset);
04122 }
04123 lip = lip->next;
04124 line_counter ++;
04125 }
04126 }
04127 if (sip != NULL && lip != NULL) {
04128 s_ReportBlockLengthError (curr_id, lip->line_num,
04129 afrp->block_size,
04130 line_counter - offset_ptr->ival,
04131 afrp->report_error,
04132 afrp->report_error_userdata);
04133 }
04134 }
04135 curr_seg ++;
04136 if (curr_seg >= afrp->num_segments)
04137 {
04138 curr_seg = 0;
04139 }
04140 }
04141 }
04142
04143
04144
04145
04146
04147
04148
04149
04150
04151
04152
04153
04154
04155
04156
04157
04158 static SLengthListPtr *
04159 s_CreateAnchorPatternForMarkedIDs
04160 (SAlignRawFilePtr afrp)
04161 {
04162 SLengthListPtr * list;
04163 SLengthListPtr * best;
04164 SLengthListPtr this_pattern;
04165 char * cp;
04166 TLineInfoPtr lip;
04167 int curr_seg;
04168
04169 if (afrp == NULL) {
04170 return NULL;
04171 }
04172
04173
04174 list = (SLengthListPtr *) malloc (afrp->num_segments * sizeof (SLengthListPtr));
04175 if (list == NULL)
04176 {
04177 return NULL;
04178 }
04179 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04180 {
04181 list[curr_seg] = NULL;
04182 }
04183
04184
04185 best = (SLengthListPtr *) malloc ((afrp->num_segments + 1) * sizeof (SLengthListPtr));
04186 if (best == NULL)
04187 {
04188 return NULL;
04189 }
04190 for (curr_seg = 0; curr_seg < afrp->num_segments + 1; curr_seg ++)
04191 {
04192 best[curr_seg] = NULL;
04193 }
04194
04195
04196 this_pattern = NULL;
04197
04198 curr_seg = 0;
04199 for (lip = afrp->line_list;
04200 lip != NULL && ! s_FoundStopLine (lip->data);
04201 lip = lip->next)
04202 {
04203 if (lip->data == NULL) continue;
04204 if (lip->data [0] == ']' || lip->data [0] == '[') continue;
04205 if (lip->data [0] == '>') {
04206 if (this_pattern != NULL) {
04207 list [curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
04208 curr_seg ++;
04209 if (curr_seg >= afrp->num_segments)
04210 {
04211 curr_seg = 0;
04212 }
04213 }
04214 this_pattern = s_LengthListNew (NULL);
04215 if (this_pattern == NULL) {
04216 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04217 {
04218 s_LengthListFree (list [curr_seg]);
04219 }
04220 free (list);
04221 return NULL;
04222 }
04223 this_pattern->num_appearances = 1;
04224 } else if (this_pattern != NULL) {
04225
04226 cp = lip->data;
04227 while ( isspace ((unsigned char)*cp) || isdigit ((unsigned char)*cp)) {
04228 cp++;
04229 }
04230 s_AddLengthRepeat (this_pattern, strlen (cp));
04231 }
04232 }
04233 if (this_pattern != NULL) {
04234 list[curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
04235 }
04236
04237
04238 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg++)
04239 {
04240 for (this_pattern = list [curr_seg];
04241 this_pattern != NULL;
04242 this_pattern = this_pattern->next)
04243 {
04244 if (this_pattern->num_appearances == 0) continue;
04245 if (best [curr_seg] == NULL
04246 || this_pattern->num_appearances > best[curr_seg]->num_appearances)
04247 {
04248 best[curr_seg] = this_pattern;
04249 }
04250
04251 }
04252
04253
04254 if (best [curr_seg] != NULL) {
04255 s_LengthListFree (best [curr_seg]->next);
04256 best [curr_seg]->next = NULL;
04257 }
04258
04259 if (best [curr_seg] != list [curr_seg]) {
04260 this_pattern = list [curr_seg];
04261 while ( this_pattern != NULL && this_pattern->next != best[curr_seg] ) {
04262 this_pattern = this_pattern->next;
04263 }
04264 if (this_pattern != NULL) {
04265 this_pattern->next = NULL;
04266 s_LengthListFree (list [curr_seg]);
04267 }
04268 }
04269 }
04270
04271 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04272 {
04273 if (best[curr_seg] == NULL)
04274 {
04275 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
04276 {
04277 s_LengthListFree (best [curr_seg]);
04278 }
04279 return NULL;
04280 }
04281 }
04282
04283 return best;
04284 }
04285
04286
04287
04288
04289
04290 static void s_RemoveBasePairCountCommentsFromData (SAlignRawFilePtr afrp)
04291 {
04292 TIntLinkPtr this_offset, next_offset;
04293 TLineInfoPtr lip;
04294 int line_count;
04295 char * cp;
04296
04297 if (afrp == NULL || afrp->offset_list == NULL) {
04298 return;
04299 }
04300 this_offset = afrp->offset_list;
04301 next_offset = this_offset->next;
04302 lip = afrp->line_list;
04303 line_count = 0;
04304 while (lip != NULL && this_offset != NULL) {
04305 if (line_count == this_offset->ival) {
04306 while (lip != NULL &&
04307 (next_offset == NULL
04308 || line_count < next_offset->ival - 1)) {
04309 cp = lip->data;
04310 if (cp != NULL) {
04311 cp += strspn (cp, " \t\r\n1234567890");
04312 if (cp != lip->data) {
04313 strcpy (lip->data, cp);
04314 }
04315 }
04316 line_count ++;
04317 lip = lip->next;
04318 }
04319 this_offset = this_offset->next;
04320 if (this_offset != NULL) {
04321 next_offset = this_offset->next;
04322 }
04323 } else {
04324 line_count ++;
04325 lip = lip->next;
04326 }
04327 }
04328 }
04329
04330
04331
04332
04333
04334
04335
04336
04337 static void s_ProcessAlignFileRawForMarkedIDs (SAlignRawFilePtr afrp)
04338 {
04339 SLengthListPtr * anchorpattern;
04340
04341 if (afrp == NULL) {
04342 return;
04343 }
04344
04345 s_RemoveBasePairCountCommentsFromData (afrp);
04346 anchorpattern = s_CreateAnchorPatternForMarkedIDs (afrp);
04347 if (anchorpattern == NULL || afrp->offset_list == NULL) {
04348 return;
04349 }
04350 s_CreateSequencesBasedOnTokenPatterns (afrp->line_list, afrp->offset_list,
04351 anchorpattern, afrp);
04352 }
04353
04354
04355
04356
04357
04358
04359
04360 static void
04361 s_StringLeftShift
04362 (char * cp_from,
04363 char * cp_to)
04364 {
04365 if (cp_from == cp_to || cp_from == NULL || cp_to == NULL) {
04366 return;
04367 }
04368 while (*cp_to != 0) {
04369 *cp_from = *cp_to;
04370 cp_from++;
04371 cp_to++;
04372 }
04373 *cp_from = 0;
04374 }
04375
04376
04377
04378
04379
04380
04381 static TLineInfoPtr s_RemoveCommentsFromTokens (TLineInfoPtr list)
04382 {
04383 TLineInfoPtr lip;
04384 int num_comment_starts;
04385 char * cp_l;
04386 char * cp_r;
04387 char * cp;
04388 EBool in_comment;
04389
04390 num_comment_starts = 0;
04391 in_comment = eFalse;
04392 for (lip = list; lip != NULL; lip = lip->next) {
04393 if (lip->data == NULL) {
04394 lip->delete_me = eTrue;
04395 } else {
04396 cp_l = NULL;
04397 cp_r = NULL;
04398 for (cp = lip->data; *cp != 0; cp++) {
04399 if (*cp == ']') {
04400 if (cp_r == NULL) {
04401 s_StringLeftShift (lip->data, cp + 1);
04402 cp = lip->data - 1;
04403 } else {
04404 s_StringLeftShift (cp_r, cp + 1);
04405 cp = cp_r;
04406 if (cp_r > lip->data) {
04407 cp_r --;
04408 while (cp_r >= lip->data && *cp_r != '[') {
04409 cp_r --;
04410 }
04411 if (cp_r < lip->data) {
04412 cp_r = NULL;
04413 }
04414 } else {
04415 cp_r = NULL;
04416 }
04417 }
04418 if (num_comment_starts > 0) {
04419 num_comment_starts --;
04420 }
04421 } else if (*cp == '[') {
04422 cp_r = cp;
04423 num_comment_starts ++;
04424 }
04425 }
04426 if (in_comment) {
04427 if (num_comment_starts == 0) {
04428 in_comment = eFalse;
04429 } else {
04430 lip->delete_me = eTrue;
04431 }
04432 } else if (num_comment_starts > 0) {
04433 cp_r = strchr (lip->data, '[');
04434 if (cp_r != NULL) {
04435 *cp_r = 0;
04436 }
04437 in_comment = eTrue;
04438 }
04439 if (lip->data [0] == 0) {
04440 lip->delete_me = eTrue;
04441 }
04442 }
04443 }
04444 list = s_DeleteLineInfos (list);
04445 return list;
04446 }
04447
04448
04449
04450
04451
04452
04453 static TLineInfoPtr s_RemoveNexusCommentsFromTokens (TLineInfoPtr list)
04454 {
04455 TLineInfoPtr lip, start_lip, end_lip;
04456
04457 lip = list;
04458 start_lip = NULL;
04459 end_lip = NULL;
04460 while (lip != NULL) {
04461 if (s_StringICmp (lip->data, "#NEXUS") == 0) {
04462 start_lip = lip;
04463 end_lip = lip;
04464 while (end_lip != NULL
04465 && s_StringICmp (end_lip->data, "matrix") != 0) {
04466 end_lip = end_lip->next;
04467 }
04468 if (end_lip != NULL) {
04469 while (start_lip != end_lip) {
04470 start_lip->delete_me = eTrue;
04471 start_lip = start_lip->next;
04472 }
04473 end_lip->delete_me = eTrue;
04474 lip = end_lip->next;
04475 } else {
04476 lip = lip->next;
04477 }
04478 } else {
04479 lip = lip->next;
04480 }
04481 }
04482 list = s_DeleteLineInfos (list);
04483 return list;
04484 }
04485
04486
04487
04488
04489
04490
04491 static TSizeInfoPtr
04492 s_FindMostFrequentlyOccurringTokenLength
04493 (TSizeInfoPtr list,
04494 int not_this_size)
04495 {
04496 TSizeInfoPtr list_ptr, new_list, best_ptr, return_best;
04497
04498 new_list = NULL;
04499 for (list_ptr = list; list_ptr != NULL; list_ptr = list_ptr->next) {
04500 if (not_this_size != list_ptr->size_value) {
04501 new_list = s_AddSizeInfoAppearances (new_list,
04502 list_ptr->size_value,
04503 list_ptr->num_appearances);
04504 }
04505 }
04506 best_ptr = s_GetMostPopularSizeInfo (new_list);
04507 return_best = NULL;
04508 if (best_ptr != NULL) {
04509 return_best = s_SizeInfoNew (NULL);
04510 if (return_best != NULL) {
04511 return_best->size_value = best_ptr->size_value;
04512 return_best->num_appearances = best_ptr->num_appearances;
04513 }
04514 }
04515 s_SizeInfoFree (new_list);
04516 return return_best;
04517 }
04518
04519
04520
04521
04522
04523
04524
04525
04526
04527
04528
04529 static void
04530 s_ExtendAnchorPattern
04531 (SLengthListPtr anchorpattern,
04532 TSizeInfoPtr line_lengths)
04533 {
04534 TSizeInfoPtr last_line_lengths, sip, sip_next, twoafter;
04535 int best_last_line_length;
04536 int anchor_line_length;
04537
04538 if (anchorpattern == NULL
04539 || anchorpattern->lengthrepeats == NULL
04540 || line_lengths == NULL) {
04541 return;
04542 }
04543
04544 last_line_lengths = NULL;
04545 anchor_line_length = anchorpattern->lengthrepeats->size_value;
04546
04547
04548
04549
04550
04551 for (sip = line_lengths; sip != NULL; sip = sip->next) {
04552 if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
04553 sip_next = sip->next;
04554 if (sip_next != NULL
04555 && sip_next->size_value > 0
04556 && sip_next->size_value != anchor_line_length
04557 && ((twoafter = sip_next->next) == NULL
04558 || twoafter->size_value != anchor_line_length))
04559 {
04560 last_line_lengths = s_AddSizeInfo (last_line_lengths,
04561 sip_next->size_value);
04562 }
04563 }
04564 }
04565 best_last_line_length = s_GetMostPopularSize (last_line_lengths);
04566 if (best_last_line_length > 0) {
04567 s_AddLengthRepeat (anchorpattern, best_last_line_length);
04568 }
04569 s_SizeInfoFree (last_line_lengths);
04570 }
04571
04572
04573
04574
04575
04576
04577
04578
04579
04580 static SLengthListPtr s_FindMostPopularPattern (TSizeInfoPtr list)
04581 {
04582 SLengthListPtr patternlist, newpattern;
04583 TSizeInfoPtr sip, popular_line_length;
04584 SLengthListPtr index, best;
04585 int not_this_length;
04586
04587 patternlist = NULL;
04588 for (sip = list; sip != NULL; sip = sip->next) {
04589 if (sip->size_value > 0) {
04590 newpattern = s_LengthListNew (NULL);
04591 if (newpattern == NULL) {
04592 s_LengthListFree (patternlist);
04593 return NULL;
04594 }
04595 newpattern->num_appearances = 1;
04596 newpattern->lengthrepeats = s_SizeInfoNew (NULL);
04597 if (newpattern->lengthrepeats == NULL) {
04598 s_LengthListFree (patternlist);
04599 return NULL;
04600 }
04601 newpattern->lengthrepeats->size_value = sip->size_value;
04602 newpattern->lengthrepeats->num_appearances = sip->num_appearances;
04603 patternlist = s_AddLengthList (patternlist, newpattern);
04604 }
04605 }
04606 if (patternlist == NULL) {
04607 return NULL;
04608 }
04609
04610 best = NULL;
04611 for (index = patternlist; index != NULL; index = index->next) {
04612 if (index->lengthrepeats->num_appearances < 2) {
04613 continue;
04614 }
04615 if (best==NULL || best->num_appearances < index->num_appearances) {
04616 best = index;
04617 } else if (best->num_appearances == index->num_appearances
04618 && best->lengthrepeats->size_value <
04619 index->lengthrepeats->size_value) {
04620 best = index;
04621 }
04622 }
04623
04624
04625 index = patternlist;
04626 while ( index != NULL && index->next != best ) {
04627 index = index->next;
04628 }
04629 if (index != NULL) {
04630 index->next = NULL;
04631 s_LengthListFree (patternlist);
04632 }
04633
04634 if (best != NULL) {
04635 s_LengthListFree (best->next);
04636 best->next = NULL;
04637 }
04638
04639 popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list, 0);
04640
04641 if (best != NULL && best->lengthrepeats != NULL
04642 && popular_line_length != NULL
04643 && best->lengthrepeats->size_value == popular_line_length->size_value)
04644 {
04645 not_this_length = popular_line_length->size_value;
04646 s_SizeInfoFree (popular_line_length);
04647 popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list,
04648 not_this_length);
04649 }
04650
04651 if (best == NULL
04652 || (popular_line_length != NULL
04653 && popular_line_length->size_value > best->lengthrepeats->size_value
04654 && popular_line_length->num_appearances > best->num_appearances))
04655 {
04656 if (best == NULL) {
04657 best = s_LengthListNew (NULL);
04658 if (best == NULL) {
04659 return NULL;
04660 }
04661 }
04662 best->lengthrepeats = s_SizeInfoNew (NULL);
04663 if (best->lengthrepeats == NULL) {
04664 return NULL;
04665 }
04666 best->lengthrepeats->size_value = popular_line_length->size_value;
04667 best->lengthrepeats->num_appearances = 1;
04668 } else {
04669
04670 s_ExtendAnchorPattern (best, list);
04671 }
04672
04673 s_SizeInfoFree (popular_line_length);
04674
04675 return best;
04676 }
04677
04678
04679
04680
04681
04682
04683 static TIntLinkPtr
04684 s_CreateOffsetList
04685 (TSizeInfoPtr list,
04686 SLengthListPtr anchorpattern)
04687 {
04688 int line_counter;
04689 TIntLinkPtr offset_list, new_offset;
04690 TSizeInfoPtr sip, prev_sip;
04691
04692 if (list == NULL || anchorpattern == NULL) {
04693 return NULL;
04694 }
04695 line_counter = 0;
04696 offset_list = NULL;
04697 prev_sip = NULL;
04698 for (sip = list; sip != NULL; sip = sip->next) {
04699 if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
04700 new_offset = s_IntLinkNew (line_counter, offset_list);
04701 if (new_offset == NULL) {
04702 s_IntLinkFree (offset_list);
04703 return NULL;
04704 }
04705 if (offset_list == NULL) {
04706 offset_list = new_offset;
04707 }
04708 }
04709
04710 line_counter += sip->num_appearances;
04711 prev_sip = sip;
04712 }
04713 return offset_list;
04714 }
04715
04716
04717
04718
04719
04720
04721
04722
04723 static int
04724 s_ForecastPattern
04725 (int line_start,
04726 int pattern_length,
04727 TIntLinkPtr next_offset,
04728 int sip_offset,
04729 TSizeInfoPtr list)
04730 {
04731 int offset, end_offset;
04732 TSizeInfoPtr sip;
04733 int line_counter, num_chars;
04734
04735 if (list == NULL) {
04736 return -1;
04737 }
04738
04739 for (offset = sip_offset; offset < list->num_appearances; offset++) {
04740 line_counter = line_start + offset;
04741 num_chars = list->size_value * (list->num_appearances - offset);
04742 sip = list;
04743 while (num_chars < pattern_length
04744 && (next_offset == NULL || line_counter < next_offset->ival)
04745 && sip->next != NULL)
04746 {
04747 sip = sip->next;
04748 for (end_offset = 0;
04749 end_offset < sip->num_appearances
04750 && num_chars < pattern_length
04751 && (next_offset == NULL
04752 || line_counter < next_offset->ival);
04753 end_offset ++)
04754 {
04755 num_chars += sip->size_value;
04756 line_counter ++;
04757 }
04758 }
04759 if (num_chars == pattern_length) {
04760 return line_start + offset;
04761 }
04762 }
04763 return -1;
04764 }
04765
04766
04767
04768
04769
04770
04771
04772 static TIntLinkPtr
04773 s_AugmentOffsetList
04774 (TIntLinkPtr offset_list,
04775 TSizeInfoPtr list,
04776 SLengthListPtr anchorpattern)
04777 {
04778 int pattern_length;
04779 TSizeInfoPtr sip;
04780 TIntLinkPtr prev_offset, next_offset, new_offset;
04781 int line_counter, forecast_position, line_skip;
04782 EBool skipped_previous = eFalse;
04783 int num_chars;
04784 int num_additional_offsets = 0;
04785 int max_additional_offsets = 5000;
04786
04787 if (list == NULL || anchorpattern == NULL) {
04788 return offset_list;
04789 }
04790
04791 pattern_length = 0;
04792 for (sip = anchorpattern->lengthrepeats; sip != NULL; sip = sip->next) {
04793 pattern_length += (sip->size_value * sip->num_appearances);
04794 }
04795 if (pattern_length == 0) {
04796 return offset_list;
04797 }
04798
04799 prev_offset = NULL;
04800 next_offset = offset_list;
04801 line_counter = 0;
04802 sip = list;
04803 while (sip != NULL && num_additional_offsets < max_additional_offsets) {
04804
04805 if (next_offset != NULL && line_counter > next_offset->ival) {
04806 next_offset = next_offset->next;
04807 } else if (next_offset != NULL && line_counter == next_offset->ival) {
04808 skipped_previous = eFalse;
04809 prev_offset = next_offset;
04810 next_offset = next_offset->next;
04811
04812 num_chars = 0;
04813 while (num_chars < pattern_length && sip != NULL) {
04814 num_chars += sip->size_value * sip->num_appearances;
04815 line_counter += sip->num_appearances;
04816 sip = sip->next;
04817 }
04818 } else if (skipped_previous) {
04819 line_skip = 0;
04820 while (sip != NULL && line_skip < sip->num_appearances
04821 && num_additional_offsets < max_additional_offsets
04822 && (next_offset == NULL
04823 || line_counter < next_offset->ival)) {
04824
04825
04826
04827 forecast_position = s_ForecastPattern (line_counter,
04828 pattern_length,
04829 next_offset, line_skip,
04830 sip);
04831 if (forecast_position > 0) {
04832 new_offset = s_IntLinkNew (forecast_position, NULL);
04833 num_additional_offsets++;
04834 if (new_offset == NULL) {
04835 return NULL;
04836 }
04837 if (prev_offset == NULL) {
04838 new_offset->next = offset_list;
04839 offset_list = new_offset;
04840 } else {
04841 new_offset->next = next_offset;
04842 prev_offset->next = new_offset;
04843 }
04844 prev_offset = new_offset;
04845
04846
04847
04848 num_chars = 0;
04849 while (num_chars < pattern_length && sip != NULL) {
04850 for (line_skip = 0;
04851 line_skip < sip->num_appearances
04852 && num_chars < pattern_length;
04853 line_skip++)
04854 {
04855 num_chars += sip->size_value;
04856 line_counter ++;
04857 }
04858 if (line_skip == sip->num_appearances) {
04859 sip = sip->next;
04860 line_skip = 0;
04861 }
04862 }
04863 } else {
04864 line_counter += sip->num_appearances;
04865 sip = sip->next;
04866 line_skip = 0;
04867 }
04868 }
04869 } else {
04870 skipped_previous = eTrue;
04871 line_counter += sip->num_appearances;
04872 sip = sip->next;
04873 }
04874 }
04875 if (num_additional_offsets >= max_additional_offsets)
04876 {
04877 s_IntLinkFree (offset_list);
04878 offset_list = NULL;
04879 }
04880 return offset_list;
04881 }
04882
04883
04884
04885
04886
04887 static int s_GetMostPopularPatternLength (TIntLinkPtr offset_list)
04888 {
04889 int line_counter, best_length;
04890 TSizeInfoPtr pattern_length_list;
04891 TIntLinkPtr offset;
04892
04893 if (offset_list == NULL) {
04894 return -1;
04895 }
04896
04897 line_counter = -1;
04898 pattern_length_list = NULL;
04899 for (offset = offset_list; offset != NULL; offset = offset->next) {
04900 if (line_counter != -1) {
04901 pattern_length_list = s_AddSizeInfo (pattern_length_list,
04902 offset->ival - line_counter);
04903 }
04904 line_counter = offset->ival;
04905 }
04906 best_length = s_GetMostPopularSize (pattern_length_list);
04907 s_SizeInfoFree (pattern_length_list);
04908 return best_length;
04909 }
04910
04911
04912
04913
04914
04915 static int
04916 s_GetBestCharacterLength
04917 (TLineInfoPtr token_list,
04918 TIntLinkPtr offset_list,
04919 int block_length)
04920 {
04921 TLineInfoPtr lip;
04922 TIntLinkPtr prev_offset, new_offset;
04923 int line_diff, num_chars, best_num_chars;
04924 TSizeInfoPtr pattern_length_list = NULL;
04925
04926 if (token_list == NULL || offset_list == NULL || block_length < 1) {
04927 return -1;
04928 }
04929
04930 lip = token_list;
04931 prev_offset = NULL;
04932 for (new_offset = offset_list;
04933 new_offset != NULL && lip != NULL;
04934 new_offset = new_offset->next)
04935 {
04936 if (prev_offset == NULL) {
04937
04938 for (line_diff = 0;
04939 line_diff < new_offset->ival && lip != NULL;
04940 line_diff ++)
04941 {
04942 lip = lip->next;
04943 }
04944 }
04945 if (prev_offset != NULL) {
04946 num_chars = 0;
04947 for (line_diff = 0;
04948 line_diff < new_offset->ival - prev_offset->ival
04949 && lip != NULL;
04950 line_diff ++)
04951 {
04952 if (line_diff < new_offset->ival - prev_offset->ival - 1) {
04953 num_chars += strlen (lip->data);
04954 }
04955 lip = lip->next;
04956 }
04957 if (new_offset->ival - prev_offset->ival == block_length) {
04958 pattern_length_list = s_AddSizeInfo (pattern_length_list,
04959 num_chars);
04960 }
04961 }
04962 prev_offset = new_offset;
04963 }
04964 best_num_chars = s_GetMostPopularSize (pattern_length_list);
04965 if (best_num_chars == 0 && pattern_length_list != NULL) {
04966 best_num_chars = pattern_length_list->size_value;
04967 }
04968 s_SizeInfoFree (pattern_length_list);
04969 pattern_length_list = NULL;
04970 return best_num_chars;
04971 }
04972
04973
04974 static int
04975 s_CountCharactersBetweenOffsets
04976 (TLineInfoPtr list,
04977 int distance,
04978 int desired_num_chars)
04979 {
04980 int line_diff, num_chars, total_chars, pattern_length, num_starts;
04981 TLineInfoPtr lip;
04982 TIntLinkPtr length_list, start_list, start_ptr, length;
04983 int start_of_unknown;
04984 int num_additional_offsets_needed;
04985
04986 if (list == NULL || distance == 0 || desired_num_chars == 0) {
04987 return 0;
04988 }
04989
04990
04991
04992
04993
04994 total_chars = 0;
04995 for (lip = list, line_diff = 0;
04996 lip != NULL && line_diff < distance
04997 && total_chars < desired_num_chars;
04998 lip = lip->next, line_diff++) {
04999 num_chars = strlen (lip->data);
05000 total_chars += num_chars;
05001 }
05002 while (lip != NULL && line_diff < distance && s_IsBlank (lip->data)) {
05003 lip = lip->next;
05004 line_diff ++;
05005 }
05006
05007 if (lip != NULL) {
05008 lip = lip->next;
05009 line_diff ++;
05010 }
05011
05012 if (lip == NULL || line_diff == distance) {
05013 return 0;
05014 }
05015 num_starts = 1;
05016 list = lip->next;
05017 start_of_unknown = line_diff;
05018
05019 length_list = NULL;
05020 total_chars = 0;
05021 for (lip = list;
05022 lip != NULL && line_diff < distance;
05023 lip = lip->next, line_diff++)
05024 {
05025 num_chars = strlen (lip->data);
05026 length = s_IntLinkNew (num_chars, length_list);
05027 if (length_list == NULL) {
05028 length_list = length;
05029 }
05030 total_chars += num_chars;
05031 }
05032
05033
05034 num_additional_offsets_needed = (total_chars / desired_num_chars);
05035 if (num_additional_offsets_needed == 0) {
05036 return 0;
05037 }
05038
05039
05040
05041
05042 start_list = NULL;
05043 num_starts = 0;
05044 pattern_length = 0;
05045 for (start_ptr = length_list, line_diff = start_of_unknown;
05046 start_ptr != NULL && line_diff < distance
05047 && pattern_length < distance - line_diff ;
05048 start_ptr = start_ptr->next, line_diff++) {
05049 num_chars = start_ptr->ival;
05050 pattern_length = 1;
05051 length = start_ptr->next;
05052 while (num_chars < desired_num_chars
05053 && pattern_length + line_diff < distance
05054 && length != NULL)
05055 {
05056 num_chars += length->ival;
05057 pattern_length ++;
05058 length = length->next;
05059 }
05060 if (num_chars == desired_num_chars) {
05061 length = s_IntLinkNew (line_diff, start_list);
05062 if (start_list == NULL) {
05063 start_list = length;
05064 }
05065 num_starts ++;
05066 }
05067 }
05068
05069
05070
05071 s_IntLinkFree (length_list);
05072 s_IntLinkFree (start_list);
05073 return 0;
05074 }
05075
05076
05077
05078
05079
05080 static void s_InsertNewOffsets
05081 (TLineInfoPtr token_list,
05082 TIntLinkPtr offset_list,
05083 int block_length,
05084 int best_num_chars,
05085 char * alphabet)
05086 {
05087 TLineInfoPtr lip, prev_start;
05088 TIntLinkPtr prev_offset, new_offset, splice_offset;
05089 int line_diff, num_chars, line_start;
05090
05091 if (token_list == NULL || offset_list == NULL
05092 || block_length < 1 || best_num_chars < 1)
05093 {
05094 return;
05095 }
05096
05097 lip = token_list;
05098 prev_offset = NULL;
05099 for (new_offset = offset_list;
05100 new_offset != NULL && lip != NULL;
05101 new_offset = new_offset->next) {
05102 if (prev_offset == NULL) {
05103
05104 for (line_diff = 0;
05105 line_diff < new_offset->ival && lip != NULL;
05106 line_diff ++) {
05107 lip = lip->next;
05108 }
05109 } else {
05110 if (new_offset->ival - prev_offset->ival == block_length) {
05111
05112 for (line_diff = 0;
05113 line_diff < new_offset->ival - prev_offset->ival
05114 && lip != NULL;
05115 line_diff ++) {
05116 lip = lip->next;
05117 }
05118 } else {
05119
05120 prev_start = lip;
05121 num_chars = 0;
05122 for (line_diff = 0;
05123 line_diff < new_offset->ival - prev_offset->ival
05124 && lip != NULL && num_chars < best_num_chars;
05125 line_diff ++) {
05126 num_chars += strlen (lip->data);
05127 lip = lip->next;
05128 }
05129 if (lip == NULL) {
05130 return;
05131 }
05132
05133 line_diff ++;
05134 lip = lip->next;
05135 if (line_diff < new_offset->ival - prev_offset->ival) {
05136 line_start = line_diff + prev_offset->ival;
05137
05138 while (line_diff < new_offset->ival - prev_offset->ival
05139 && lip != NULL)
05140 {
05141 lip = lip->next;
05142 line_diff ++;
05143 }
05144
05145 splice_offset = s_IntLinkNew (line_start, NULL);
05146 if (splice_offset == NULL) {
05147 return;
05148 }
05149 splice_offset->next = new_offset;
05150 prev_offset->next = splice_offset;
05151
05152 s_CountCharactersBetweenOffsets (lip,
05153 new_offset->ival - splice_offset->ival,
05154 best_num_chars);
05155 }
05156 }
05157 }
05158 prev_offset = new_offset;
05159 }
05160
05161
05162 for (line_diff = 0;
05163 line_diff < block_length && lip != NULL;
05164 line_diff ++) {
05165 lip = lip->next;
05166 }
05167
05168
05169 if (lip != NULL && ! s_SkippableString (lip->data)) {
05170 splice_offset = s_IntLinkNew (line_diff + prev_offset->ival, prev_offset);
05171 }
05172 }
05173
05174
05175
05176 static EBool s_ContainsDigits (char *data)
05177 {
05178 char *cp;
05179
05180 if (data == NULL) return eFalse;
05181 for (cp = data; *cp != 0; cp++) {
05182 if (isdigit ((unsigned char)(*cp))) {
05183 return eTrue;
05184 }
05185 }
05186 return eFalse;
05187 }
05188
05189
05190
05191
05192
05193
05194 static void s_ProcessAlignFileRawByLengthPattern (SAlignRawFilePtr afrp)
05195 {
05196 TLineInfoPtr token_list;
05197 SLengthListPtr list;
05198 TLineInfoPtr lip;
05199 SLengthListPtr anchorpattern[2];
05200 TIntLinkPtr offset_list;
05201 int best_length;
05202 int best_num_chars;
05203
05204 if (afrp == NULL || afrp->line_list == NULL) {
05205 return;
05206 }
05207
05208 token_list = s_BuildTokenList (afrp->line_list);
05209 token_list = s_RemoveCommentsFromTokens (token_list);
05210 token_list = s_RemoveNexusCommentsFromTokens (token_list);
05211
05212 list = s_LengthListNew ( NULL );
05213 for (lip = token_list;
05214 lip != NULL && ! s_FoundStopLine (lip->data);
05215 lip = lip->next)
05216 {
05217 if (s_SkippableString (lip->data) || s_ContainsDigits(lip->data)) {
05218 s_AddLengthRepeat (list, 0);
05219 } else {
05220 s_AddLengthRepeat (list, strlen (lip->data));
05221 }
05222 }
05223
05224 anchorpattern [0] = s_FindMostPopularPattern (list->lengthrepeats);
05225 anchorpattern [1] = NULL;
05226 if (anchorpattern [0] == NULL || anchorpattern[0]->lengthrepeats == NULL) {
05227 return;
05228 }
05229
05230
05231
05232
05233 offset_list = s_CreateOffsetList (list->lengthrepeats, anchorpattern[0]);
05234 offset_list = s_AugmentOffsetList (offset_list,
05235 list->lengthrepeats,
05236 anchorpattern[0]);
05237
05238
05239 best_length = s_GetMostPopularPatternLength (offset_list);
05240 if (best_length < 1 && offset_list != NULL && offset_list->next != NULL) {
05241 best_length = offset_list->next->ival - offset_list->ival;
05242 }
05243 best_num_chars = s_GetBestCharacterLength (token_list, offset_list,
05244 best_length);
05245 s_InsertNewOffsets (token_list, offset_list, best_length, best_num_chars,
05246 afrp->alphabet);
05247
05248
05249
05250
05251 s_CreateSequencesBasedOnTokenPatterns (token_list, offset_list,
05252 anchorpattern, afrp);
05253
05254 s_LengthListFree (anchorpattern[0]);
05255 s_LengthListFree (list);
05256 s_LineInfoFree (token_list);
05257 }
05258
05259
05260
05261
05262
05263
05264
05265
05266
05267
05268
05269
05270
05271
05272 extern TAlignmentFilePtr AlignmentFileNew (void)
05273 {
05274 TAlignmentFilePtr afp;
05275
05276 afp = (TAlignmentFilePtr) malloc (sizeof (SAlignmentFile));
05277 if (afp == NULL) {
05278 return NULL;
05279 }
05280 afp->num_sequences = 0;
05281 afp->num_organisms = 0;
05282 afp->num_deflines = 0;
05283 afp->num_segments = 0;
05284 afp->ids = NULL;
05285 afp->sequences = NULL;
05286 afp->organisms = NULL;
05287 afp->deflines = NULL;
05288 return afp;
05289 }
05290
05291
05292
05293
05294
05295 extern void AlignmentFileFree (TAlignmentFilePtr afp)
05296 {
05297 int index;
05298
05299 if (afp == NULL) {
05300 return;
05301 }
05302 if (afp->ids != NULL) {
05303 for (index = 0; index < afp->num_sequences; index++) {
05304 free (afp->ids [index]);
05305 }
05306 free (afp->ids);
05307 afp->ids = NULL;
05308 }
05309 if (afp->sequences != NULL) {
05310 for (index = 0; index < afp->num_sequences; index++) {
05311 free (afp->sequences [index]);
05312 }
05313 free (afp->sequences);
05314 afp->sequences = NULL;
05315 }
05316 if (afp->organisms != NULL) {
05317 for (index = 0; index < afp->num_organisms; index++) {
05318 free (afp->organisms [index]);
05319 }
05320 free (afp->organisms);
05321 afp->sequences = NULL;
05322 }
05323 if (afp->deflines != NULL) {
05324 for (index = 0; index < afp->num_deflines; index++) {
05325 free (afp->deflines [index]);
05326 }
05327 free (afp->deflines);
05328 afp->deflines = NULL;
05329 }
05330 free (afp);
05331 }
05332
05333
05334
05335
05336
05337
05338 static char * s_GetIdFromString (char * str)
05339 {
05340 char * cp;
05341 char * id;
05342 int len;
05343
05344 if (str == NULL) {
05345 return NULL;
05346 }
05347
05348 cp = str;
05349 cp += strspn (str, " >\t");
05350 len = strcspn (cp, " \t\r\n");
05351 if (len == 0) {
05352 return NULL;
05353 }
05354 id = (char *)malloc (len + 1);
05355 if (id == NULL) {
05356 return NULL;
05357 }
05358 strncpy (id, cp, len);
05359 id [ len ] = 0;
05360 return id;
05361 }
05362
05363
05364
05365
05366
05367 static char * s_GetDeflineFromIdString (char * str)
05368 {
05369 char * cp;
05370 int len;
05371
05372 if (str == NULL) {
05373 return NULL;
05374 }
05375
05376 cp = str;
05377 cp += strspn (str, " >\t");
05378 len = strcspn (cp, " \t\r\n");
05379 if (len == 0) {
05380 return NULL;
05381 }
05382 cp += len;
05383 len = strspn (cp, " \t\r\n");
05384 if (len == 0) {
05385 return NULL;
05386 }
05387 cp += len;
05388 if (*cp == 0) {
05389 return NULL;
05390 }
05391 return strdup (cp);
05392 }
05393
05394
05395
05396
05397
05398
05399 static EBool s_ReprocessIds (SAlignRawFilePtr afrp)
05400 {
05401 TStringCountPtr list, scp;
05402 TAlignRawSeqPtr arsp;
05403 TLineInfoPtr lip;
05404 char * id;
05405 int line_num;
05406 EBool rval = eTrue;
05407 char * defline;
05408
05409 if (afrp == NULL) {
05410 return eFalse;
05411 }
05412
05413 list = NULL;
05414 lip = afrp->deflines;
05415 for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
05416 if (arsp->id_lines != NULL) {
05417 line_num = arsp->id_lines->ival;
05418 } else {
05419 line_num = -1;
05420 }
05421 s_RemoveOrganismCommentFromLine (arsp->id);
05422 id = s_GetIdFromString (arsp->id);
05423 if (lip == NULL) {
05424 defline = s_GetDeflineFromIdString (arsp->id);
05425 afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
05426 line_num, 0);
05427 free (defline);
05428 afrp->num_deflines ++;
05429 }
05430 free (arsp->id);
05431 arsp->id = id;
05432 list = s_AddStringCount (arsp->id, line_num, list);
05433 }
05434
05435 for (scp = list; scp != NULL; scp = scp->next) {
05436 if (scp->num_appearances > 1) {
05437 rval = eFalse;
05438 s_ReportRepeatedId (scp, afrp->report_error,
05439 afrp->report_error_userdata);
05440 }
05441 }
05442
05443 s_StringCountFree (list);
05444 return rval;
05445 }
05446
05447
05448
05449
05450
05451
05452
05453
05454
05455 static int
05456 s_ReportRepeatedBadCharsInSequence
05457 (TLineInfoReaderPtr lirp,
05458 char * id,
05459 char * reason,
05460 FReportErrorFunction report_error,
05461 void * report_error_userdata)
05462 {
05463 int bad_line_num, bad_line_offset;
05464 int num_bad_chars;
05465 char bad_char, curr_char;
05466 int data_position;
05467
05468 bad_line_num = s_LineInfoReaderGetCurrentLineNumber (lirp);
05469 bad_line_offset = s_LineInfoReaderGetCurrentLineOffset (lirp);
05470 bad_char = *lirp->curr_line_pos;
05471 num_bad_chars = 1;
05472 data_position = lirp->data_pos + 1;
05473 while ((curr_char = s_FindNthDataChar (lirp, data_position)) == bad_char) {
05474 num_bad_chars ++;
05475 data_position ++;
05476 }
05477 s_ReportBadCharError (id, bad_char, num_bad_chars,
05478 bad_line_offset, bad_line_num, reason,
05479 report_error, report_error_userdata);
05480 return data_position;
05481 }
05482
05483
05484
05485
05486
05487
05488
05489
05490
05491
05492
05493
05494 static EBool
05495 s_FindBadDataCharsInSequence
05496 (TAlignRawSeqPtr arsp,
05497 TAlignRawSeqPtr master_arsp,
05498 TSequenceInfoPtr sip,
05499 int num_segments,
05500 FReportErrorFunction report_error,
05501 void * report_error_userdata)
05502 {
05503 TLineInfoReaderPtr lirp, master_lirp;
05504 int data_position;
05505 int middle_start = 0;
05506 int middle_end = 0;
05507 char curr_char, master_char;
05508 EBool found_middle_start;
05509 EBool rval = eFalse;
05510 EBool match_not_in_beginning_gap;
05511 EBool match_not_in_end_gap;
05512
05513 if (arsp == NULL || master_arsp == NULL || sip == NULL) {
05514 return eTrue;
05515 }
05516 lirp = s_LineInfoReaderNew (arsp->sequence_data);
05517 if (lirp == NULL) {
05518 return eTrue;
05519 }
05520 if (arsp != master_arsp) {
05521 master_lirp = s_LineInfoReaderNew (master_arsp->sequence_data);
05522 if (master_lirp == NULL) {
05523 s_LineInfoReaderFree (lirp);
05524 return eTrue;
05525 }
05526 } else {
05527 master_lirp = NULL;
05528 }
05529
05530 if (strcspn (sip->beginning_gap, sip->match)
05531 == strlen (sip->beginning_gap)) {
05532 match_not_in_beginning_gap = eTrue;
05533 } else {
05534 match_not_in_beginning_gap = eFalse;
05535 }
05536
05537 if (strcspn (sip->end_gap, sip->match) == strlen (sip->end_gap)) {
05538 match_not_in_end_gap = eTrue;
05539 } else {
05540 match_not_in_end_gap = eFalse;
05541 }
05542
05543
05544
05545
05546 found_middle_start = eFalse;
05547 data_position = 0;
05548 curr_char = s_FindNthDataChar (lirp, data_position);
05549 while (curr_char != 0) {
05550 if (strchr (sip->alphabet, curr_char) != NULL) {
05551 if (! found_middle_start) {
05552 middle_start = data_position;
05553 found_middle_start = eTrue;
05554 }
05555 middle_end = data_position + 1;
05556 data_position ++;
05557 } else if (! found_middle_start) {
05558 if (match_not_in_beginning_gap
05559 && strchr (sip->match, curr_char) != NULL)
05560 {
05561 middle_start = data_position;
05562 found_middle_start = eTrue;
05563 middle_end = data_position + 1;
05564 data_position ++;
05565 } else if (strchr (sip->beginning_gap, curr_char) == NULL) {
05566
05567
05568 data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05569 arsp->id,
05570 "expect only beginning gap characters here",
05571 report_error, report_error_userdata);
05572 rval = eTrue;
05573 } else {
05574 *lirp->curr_line_pos = '-';
05575 data_position ++;
05576 }
05577 } else {
05578 if (match_not_in_end_gap
05579 && strchr (sip->match, curr_char) != NULL)
05580 {
05581 middle_end = data_position + 1;
05582 }
05583 data_position ++;
05584 }
05585 curr_char = s_FindNthDataChar (lirp, data_position);
05586 }
05587
05588 if (! found_middle_start) {
05589 if (num_segments > 1)
05590 {
05591 return eFalse;
05592 }
05593 else
05594 {
05595 s_ReportMissingSequenceData (arsp->id,
05596 report_error, report_error_userdata);
05597 s_LineInfoReaderFree (lirp);
05598 return eTrue;
05599
05600 }
05601 }
05602
05603
05604 data_position = middle_start;
05605 while (data_position < middle_end)
05606 {
05607 curr_char = s_FindNthDataChar (lirp, data_position);
05608 while (data_position < middle_end
05609 && strchr (sip->alphabet, curr_char) != NULL) {
05610 data_position ++;
05611 curr_char = s_FindNthDataChar (lirp, data_position);
05612 }
05613 if (curr_char == 0 || data_position >= middle_end) {
05614
05615 } else if (strchr (sip->missing, curr_char) != NULL) {
05616 *lirp->curr_line_pos = 'N';
05617 data_position ++;
05618 } else if (strchr (sip->match, curr_char) != NULL) {
05619 master_char = s_FindNthDataChar (master_lirp, data_position);
05620 if (master_char == 0) {
05621
05622 if (master_arsp == arsp) {
05623 data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05624 arsp->id,
05625 "can't specify match chars in first sequence",
05626 report_error, report_error_userdata);
05627 } else {
05628 data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05629 arsp->id,
05630 "can't find source for match chars",
05631 report_error, report_error_userdata);
05632 }
05633 rval = eTrue;
05634 } else {
05635 *lirp->curr_line_pos = master_char;
05636 data_position ++;
05637 }
05638 } else if (strchr (sip->middle_gap, curr_char) != NULL) {
05639 *lirp->curr_line_pos = '-';
05640 data_position ++;
05641 } else {
05642
05643 data_position = s_ReportRepeatedBadCharsInSequence (lirp,
05644 arsp->id,
05645 "expect only sequence, missing, match,"
05646 " and middle gap characters here",
05647 report_error, report_error_userdata);
05648 rval = eTrue;
05649 }
05650 }
05651
05652
05653 data_position = middle_end;
05654 curr_char = s_FindNthDataChar (lirp, data_position);
05655 while (curr_char != 0) {
05656 if (strchr (sip->end_gap, curr_char) == NULL) {
05657
05658 data_position = s_ReportRepeatedBadCharsInSequence (lirp, arsp->id,
05659 "expect only end gap characters here",
05660 report_error, report_error_userdata);
05661 rval = eTrue;
05662 } else {
05663 *lirp->curr_line_pos = '-';
05664 data_position++;
05665 }
05666 curr_char = s_FindNthDataChar (lirp, data_position);
05667 }
05668 s_LineInfoReaderFree (lirp);
05669 s_LineInfoReaderFree (master_lirp);
05670 return rval;
05671 }
05672
05673
05674
05675
05676
05677
05678
05679 static EBool
05680 s_s_FindBadDataCharsInSequenceList
05681 (SAlignRawFilePtr afrp,
05682 TSequenceInfoPtr sip)
05683 {
05684 TAlignRawSeqPtr arsp;
05685 EBool rval = eFalse;
05686
05687 if (afrp == NULL || afrp->sequences == NULL) {
05688 return eTrue;
05689 }
05690 for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
05691 if (s_FindBadDataCharsInSequence (arsp, afrp->sequences, sip,
05692 afrp->num_segments,
05693 afrp->report_error,
05694 afrp->report_error_userdata)) {
05695 rval = eTrue;
05696 }
05697 }
05698 return rval;
05699 }
05700
05701
05702
05703
05704
05705
05706 static EBool s_AreOrganismsUnique (SAlignRawFilePtr afrp)
05707 {
05708 TLineInfoPtr this_org, lip;
05709 TAlignRawSeqPtr arsp;
05710 EBool are_unique;
05711
05712 if (afrp == NULL || afrp->num_organisms == 0
05713 || afrp->organisms == NULL) {
05714 return eFalse;
05715 }
05716 are_unique = eTrue;
05717 for (this_org = afrp->organisms;
05718 this_org != NULL;
05719 this_org = this_org->next) {
05720 lip = afrp->organisms;
05721 arsp = afrp->sequences;
05722 while (lip != NULL && lip != this_org
05723 && strcmp (lip->data, this_org->data) != 0 && arsp != NULL) {
05724 lip = lip->next;
05725 arsp = arsp->next;
05726 }
05727 if (lip != NULL && lip != this_org) {
05728 are_unique = eFalse;
05729 s_ReportRepeatedOrganismName (arsp->id, this_org->line_num,
05730 lip->line_num,
05731 this_org->data,
05732 afrp->report_error,
05733 afrp->report_error_userdata);
05734 }
05735 }
05736 return are_unique;
05737 }
05738
05739
05740 #if 0
05741
05742
05743
05744 static EBool s_AreDeflinesIdentical (SAlignRawFilePtr afrp)
05745 {
05746 TLineInfoPtr lip;
05747 TStringCountPtr list;
05748 EBool rval;
05749
05750 if (afrp == NULL) {
05751 return eFalse;
05752 }
05753
05754 list = NULL;
05755 for (lip = afrp->deflines; lip != NULL; lip = lip->next) {
05756 list = s_AddStringCount (lip->data, lip->line_num, list);
05757 }
05758 rval = eTrue;
05759 if (list != NULL && list->next != NULL) {
05760 rval = eFalse;
05761 s_ReportDefinitionLineMismatch (afrp->report_error,
05762 afrp->report_error_userdata);
05763 s_ReportDefinitionLines (list, afrp->report_error,
05764 afrp->report_error_userdata);
05765 }
05766 s_StringCountFree (list);
05767 return rval;
05768 }
05769 #endif
05770
05771
05772
05773
05774
05775 static TAlignmentFilePtr
05776 s_ConvertDataToOutput
05777 (SAlignRawFilePtr afrp,
05778 TSequenceInfoPtr sip)
05779 {
05780 TAlignRawSeqPtr arsp;
05781 int index;
05782 TSizeInfoPtr * lengths;
05783 int * best_length;
05784 TAlignmentFilePtr afp;
05785 TLineInfoPtr lip;
05786 int curr_seg;
05787
05788 if (afrp == NULL || sip == NULL || afrp->sequences == NULL) {
05789 return NULL;
05790 }
05791 afp = AlignmentFileNew ();
05792 if (afp == NULL) {
05793 return NULL;
05794 }
05795
05796 afp->num_organisms = afrp->num_organisms;
05797 afp->num_deflines = afrp->num_deflines;
05798 afp->num_segments = afrp->num_segments;
05799 afp->num_sequences = 0;
05800 afp->align_format_found = afrp->align_format_found;
05801 lengths = NULL;
05802
05803 for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
05804 afp->num_sequences++;
05805 }
05806
05807 if (afp->num_sequences != afrp->num_organisms
05808 && afp->num_sequences / afp->num_segments != afrp->num_organisms) {
05809 s_ReportMissingOrganismInfo (afrp->report_error,
05810 afrp->report_error_userdata);
05811 } else {
05812 s_AreOrganismsUnique (afrp);
05813 }
05814
05815 afp->sequences = (char **)malloc (afp->num_sequences
05816 * sizeof (char *));
05817 if (afp->sequences == NULL) {
05818 AlignmentFileFree (afp);
05819 return NULL;
05820 }
05821 afp->ids = (char **)malloc (afp->num_sequences * sizeof (char *));
05822 if (afp->ids == NULL) {
05823 AlignmentFileFree (afp);
05824 return NULL;
05825 }
05826 if (afp->num_organisms > 0) {
05827 afp->organisms = (char **) malloc (afp->num_organisms
05828 * sizeof (char *));
05829 if (afp->organisms == NULL) {
05830 AlignmentFileFree (afp);
05831 return NULL;
05832 }
05833 }
05834 if (afp->num_deflines > 0) {
05835 afp->deflines = (char **)malloc (afp->num_deflines
05836 * sizeof (char *));
05837 if (afp->deflines == NULL) {
05838 AlignmentFileFree (afp);
05839 return NULL;
05840 }
05841 }
05842
05843
05844 for (lip = afrp->deflines, index = 0;
05845 lip != NULL && index < afp->num_deflines;
05846 lip = lip->next, index++) {
05847 if (lip->data == NULL) {
05848 afp->deflines [index] = NULL;
05849 } else {
05850 afp->deflines [index] = strdup (lip->data);
05851 }
05852 }
05853 while (index < afp->num_deflines) {
05854 afp->deflines [index ++] = NULL;
05855 }
05856
05857
05858 for (lip = afrp->organisms, index = 0;
05859 lip != NULL && index < afp->num_organisms;
05860 lip = lip->next, index++) {
05861 afp->organisms [index] = strdup (lip->data);
05862 }
05863
05864
05865 lengths = (TSizeInfoPtr *) malloc (sizeof (TSizeInfoPtr) * afrp->num_segments);
05866 if (lengths == NULL) {
05867 AlignmentFileFree (afp);
05868 return NULL;
05869 }
05870 best_length = (int *) malloc (sizeof (int) * afrp->num_segments);
05871 if (best_length == NULL) {
05872 free (lengths);
05873 AlignmentFileFree (afp);
05874 return NULL;
05875 }
05876 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++) {
05877 lengths [curr_seg] = NULL;
05878 best_length [curr_seg] = 0;
05879 }
05880
05881
05882 curr_seg = 0;
05883 for (arsp = afrp->sequences, index = 0;
05884 arsp != NULL && index < afp->num_sequences;
05885 arsp = arsp->next, index++) {
05886 afp->sequences [index] =
05887 s_LineInfoMergeAndStripSpaces (arsp->sequence_data);
05888
05889 if (afp->sequences [index] != NULL) {
05890 lengths [curr_seg] = s_AddSizeInfo (lengths [curr_seg], strlen (afp->sequences [index]));
05891 }
05892 afp->ids [index] = strdup (arsp->id);
05893 curr_seg ++;
05894 if (curr_seg >= afrp->num_segments) {
05895 curr_seg = 0;
05896 }
05897 }
05898 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
05899 {
05900 best_length [curr_seg] = s_GetMostPopularSize (lengths [curr_seg]);
05901 if (best_length [curr_seg] == 0 && lengths [curr_seg] != NULL) {
05902 best_length [curr_seg] = lengths [curr_seg]->size_value;
05903 }
05904 }
05905
05906 curr_seg = 0;
05907 for (index = 0; index < afp->num_sequences; index++) {
05908 if (afp->sequences [index] == NULL) {
05909 s_ReportMissingSequenceData (afp->ids [index],
05910 afrp->report_error,
05911 afrp->report_error_userdata);
05912 } else if ((int) strlen (afp->sequences [index]) != best_length [curr_seg]) {
05913 s_ReportBadSequenceLength (afp->ids [index], best_length [curr_seg],
05914 strlen (afp->sequences [index]),
05915 afrp->report_error,
05916 afrp->report_error_userdata);
05917 }
05918 curr_seg ++;
05919 if (curr_seg >= afrp->num_segments) {
05920 curr_seg = 0;
05921 }
05922 }
05923
05924 if (afrp->expected_num_sequence > 0
05925 && afrp->expected_num_sequence != afp->num_sequences)
05926 {
05927 s_ReportIncorrectNumberOfSequences (afrp->expected_num_sequence,
05928 afp->num_sequences,
05929 afrp->report_error,
05930 afrp->report_error_userdata);
05931 }
05932 if (afrp->expected_sequence_len > 0
05933 && afrp->expected_sequence_len != best_length [0])
05934 {
05935 s_ReportIncorrectSequenceLength (afrp->expected_sequence_len,
05936 best_length [0],
05937 afrp->report_error,
05938 afrp->report_error_userdata);
05939 }
05940
05941 free (best_length);
05942 for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
05943 {
05944 s_SizeInfoFree (lengths [curr_seg]);
05945 }
05946 free (lengths);
05947
05948 return afp;
05949 }
05950
05951
05952
05953
05954
05955
05956
05957
05958
05959
05960
05961
05962
05963
05964
05965 extern TAlignmentFilePtr
05966 ReadAlignmentFileEx
05967 (FReadLineFunction readfunc,
05968 void * fileuserdata,
05969 FReportErrorFunction errfunc,
05970 void * erroruserdata,
05971 TSequenceInfoPtr sequence_info,
05972 int use_nexus_file_info)
05973 {
05974 SAlignRawFilePtr afrp;
05975 TAlignmentFilePtr afp;
05976 EBool use_file = eFalse;
05977
05978 if (sequence_info == NULL || sequence_info->alphabet == NULL) {
05979 return NULL;
05980 }
05981
05982 if (use_nexus_file_info != 0)
05983 {
05984 use_file = eTrue;
05985 }
05986
05987 afrp = s_ReadAlignFileRaw ( readfunc, fileuserdata, sequence_info,
05988 use_file,
05989 errfunc, erroruserdata);
05990 if (afrp == NULL) {
05991 return NULL;
05992 }
05993
05994 if (afrp->block_size > 1) {
05995 s_ProcessAlignRawFileByBlockOffsets (afrp);
05996 } else if (afrp->marked_ids) {
05997 s_ProcessAlignFileRawForMarkedIDs (afrp);
05998 } else {
05999 s_ProcessAlignFileRawByLengthPattern (afrp);
06000 }
06001
06002 s_ReprocessIds (afrp);
06003
06004 #if 0
06005
06006 s_AreDeflinesIdentical (afrp);
06007 #endif
06008
06009 if (s_s_FindBadDataCharsInSequenceList (afrp, sequence_info)) {
06010 s_AlignFileRawFree (afrp);
06011 return NULL;
06012 }
06013
06014 afp = s_ConvertDataToOutput (afrp, sequence_info);
06015 s_AlignFileRawFree (afrp);
06016
06017 return afp;
06018 }
06019
06020
06021 extern TAlignmentFilePtr
06022 ReadAlignmentFile
06023 (FReadLineFunction readfunc,
06024 void * fileuserdata,
06025 FReportErrorFunction errfunc,
06026 void * erroruserdata,
06027 TSequenceInfoPtr sequence_info)
06028 {
06029 return ReadAlignmentFileEx (readfunc, fileuserdata, errfunc, erroruserdata,
06030 sequence_info, eFalse);
06031 }
06032
06033