|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/api/aliparse.c |
source navigation diff markup identifier search freetext search file search |
1 /*=========================================================================*/
2 /* */
3 /* aliparse.c */
4 /* */
5 /*=========================================================================*/
6
7 #include <stdarg.h>
8
9 #include <aliparse.h>
10 #include <aliread.h>
11
12 /* Defined constants */
13
/* Built-in defaults installed by Ali_GetConfig () and Ali_SetConfig () */
/* when no explicit configuration has been supplied.                    */
14 #define ALI_USE_MAYBES FALSE /* default for useMaybes */
15 #define ALI_READ_BUFFSIZE 80 /* default for readBuffSize */
16 #define ALI_GAP_CHAR '-' /* first default gap character ('.' is added as a second) */
17 #define ALI_MISSING_CHAR '?' /* default missing-data character */
18 #define ALI_CORRUPT_SEQ_THRESHOLD 95 /* default for corruptSeqThreshold */
19 #define ALI_NUCL_LINE_MAX_THRESHOLD 75 /* default for nuclLineMaxThreshold */
20 #define ALI_NUCL_LINE_MIN_THRESHOLD 25 /* default for nuclLineMinThreshold */
21
22 /* Data structures */
23
/* Running state carried between rows while a row list is analyzed.      */
/* The field notes below describe only what s_ProcessInterId () and      */
/* s_ProcessInterSeq () do with each field.                              */
24 typedef struct
25 {
26 DataInfo foundInfo; /* NOTE(review): not referenced in the reviewed portion of this file */
27 IdInfoPtr currentId; /* ID record that new sequence parts are attached to */
28 IdInfoPtr currentDeflineId; /* NOTE(review): presumably the ID the current defline belongs to -- confirm */
29 ValNodePtr lastRow; /* row examined to see whether an ID follows a sequence line */
30 Boolean hasFullLength; /* NOTE(review): not referenced in the reviewed portion of this file */
31 Boolean isFirstGroup; /* cleared once a non-first ID is seen after a sequence line */
32 Boolean isFirstId; /* TRUE until the first ID has been processed */
33 Boolean maybesFound; /* NOTE(review): not referenced in the reviewed portion of this file */
34 SeqPartPtr lastSeqPart; /* most recently appended sequence part (append point) */
35 Boolean gotAllIds; /* set once currentIdCount reaches idCount */
36 Int4 idCount; /* number of distinct IDs expected */
37 Int4 currentIdCount; /* number of ID records created so far */
38 } PatternInfo, PNTR PatternInfoPtr;
39
40 /* Filewide static variables */
41
/* Current configuration; read and written by Ali_GetConfig () and */
/* Ali_SetConfig ().                                               */
42 static AliConfigInfo s_configInfo;
/* TRUE once s_configInfo has been filled in (defaults or caller values). */
43 static Boolean s_configurationSet = FALSE;
44
45 /* Function prototypes */
46
47 static void s_FreeErrorList (ErrInfoPtr errorList);
48 static void s_FreeSequenceList (SeqPartPtr seqPtr);
49 static void s_FreeIdList (IdInfoPtr idList);
50 static void s_FreeRowList (ValNodePtr rowList);
51 static void s_FreeRowList_Safe (ValNodePtr rowList);
52 static void s_DisplayRowList (ValNodePtr rowList,
53 Int2 mask);
54 static CharPtr s_GetRowIdString (ValNodePtr row);
55 static CharPtr s_GetRowSeqString (ValNodePtr row);
56 static IdInfoPtr s_ProcessMaybes (ValNodePtr rowList);
57 static int s_SegCompare(const void *i,
58 const void *j);
59 static Boolean s_IsInterleaved (ValNodePtr rowList,
60 Int2 PNTR idCount);
61 static Boolean s_ProcessInterId (CharPtr newIdStr,
62 PatternInfoPtr pattern,
63 AlignFileDataPtr fileInfoPtr,
64 Boolean isMaybe);
65 static Boolean s_ProcessInterSeq (CharPtr newSeqStr,
66 PatternInfoPtr pattern,
67 AlignFileDataPtr fileInfoPtr,
68 Boolean isMaybe);
69 static Boolean s_AnalyzeInterleaved (ValNodePtr rowList,
70 AlignFileDataPtr fileInfoPtr,
71 Int2 idCount);
72 static Boolean s_ProcessContigId (CharPtr newIdStr,
73 PatternInfoPtr pattern,
74 AlignFileDataPtr fileInfoPtr);
75 static Boolean s_ProcessContigSeq (CharPtr newSeqStr,
76 PatternInfoPtr pattern,
77 AlignFileDataPtr fileInfoPtr);
78 static Boolean s_AnalyzeContiguous (ValNodePtr rowList,
79 AlignFileDataPtr fileInfoPtr);
80 static Boolean s_AnalyzeContents (ValNodePtr rowList,
81 AlignFileDataPtr fileInfoPtr);
82 static void s_SortErrors (AlignFileDataPtr fileInfoPtr);
83 static void s_AnalyzeErrors (AlignFileDataPtr fileInfoPtr);
84 static Boolean s_CheckContext (ValNodePtr rowList,
85 AlignFileDataPtr fileInfoPtr);
86
87
88 /*=========================================================================*/
89 /* */
90 /* Ali_GetConfig () -- Get the current configuration settings. */
91 /* */
92 /*=========================================================================*/
93
94 AliConfigInfoPtr Ali_GetConfig (void)
95 {
96
97 AliConfigInfoPtr configPtr;
98
99 /* If configuration hasn't been set yet, */
100 /* then set it to the defaults. */
101
102 if (s_configurationSet == FALSE)
103 {
104 s_configInfo.useMaybes = ALI_USE_MAYBES;
105 s_configInfo.readBuffSize = ALI_READ_BUFFSIZE;
106 s_configInfo.debugLevel = ALI_SHOW_NONE;
107 s_configInfo.corruptSeqThreshold = ALI_CORRUPT_SEQ_THRESHOLD;
108 s_configInfo.nuclLineMinThreshold = ALI_NUCL_LINE_MIN_THRESHOLD;
109 s_configInfo.nuclLineMaxThreshold = ALI_NUCL_LINE_MAX_THRESHOLD;
110 s_configInfo.errExpandLevel = ALI_ERRMSG_EXPAND_SOME;
111 s_configInfo.declaredInfo.dataType = ALI_UNKNOWN;
112 s_configInfo.declaredInfo.contigOrInter = ALI_UNKNOWN;
113 s_configInfo.declaredInfo.idCount = 0;
114 s_configInfo.declaredInfo.seqLength = 0;
115
116 s_configInfo.gapChar = (CharPtr) MemNew (32);
117 sprintf (s_configInfo.gapChar , "%c%c", ALI_GAP_CHAR, '.');
118 s_configInfo.missingChar = (CharPtr) MemNew (32);
119 sprintf (s_configInfo.missingChar, "%c", ALI_MISSING_CHAR);
120
121 s_configurationSet = TRUE;
122 }
123
124 /* Copy the current settings to the return struct */
125
126 configPtr = (AliConfigInfoPtr) MemNew (sizeof (AliConfigInfo));
127 MemSet (configPtr, 0, sizeof (AliConfigInfo));
128
129 configPtr->useMaybes = s_configInfo.useMaybes;
130 configPtr->readBuffSize = s_configInfo.readBuffSize;
131 configPtr->debugLevel = s_configInfo.debugLevel;
132 configPtr->corruptSeqThreshold = s_configInfo.corruptSeqThreshold;
133 configPtr->nuclLineMinThreshold = s_configInfo.nuclLineMinThreshold;
134 configPtr->nuclLineMaxThreshold = s_configInfo.nuclLineMaxThreshold;
135 configPtr->errExpandLevel = s_configInfo.errExpandLevel;
136
137 configPtr->gapChar = (CharPtr) MemNew (32);
138 StringCpy (configPtr->gapChar, s_configInfo.gapChar);
139 configPtr->missingChar = (CharPtr) MemNew (32);
140 StringCpy (configPtr->missingChar, s_configInfo.missingChar);
141
142 /* Return successfully */
143
144 return configPtr;
145 }
146
147 /*=========================================================================*/
148 /* */
149 /* Ali_SetConfig () - Sets various runtime configuration options used by */
150 /* the Ali_Read () function. */
151 /* */
152 /* configPtr */
153 /* --------- */
154 /* */
155 /* The configPtr parameter contains new values for one or more */
156 /* configuration settings. The values that are applied are selected by */
157 /* the options parameter. */
158 /* */
159 /* gapChar - [default: '-'] -- This is the character that will be used */
160 /* as the gap character if the file does not define one. */
161 /* */
162 /* missingChar - [default: '?'] -- This is the character that will be */
163 /* used as missing character if the file does not define */
164 /* one. */
165 /* */
166 /* useMaybes - [default: FALSE] -- If a line is found that doesn't */
167 /* quite meet the criteria for being a sequence, but is */
168 /* close enough that it might be a slightly mangled */
169 /* sequence line, then it is marked as a 'maybe'. The */
170 /* useMaybes setting determines how these 'maybe' */
171 /* sequences are treated. If set to FALSE, they ARE NOT */
172 /* treated as sequences, if set to TRUE they ARE treated */
173 /* as sequences. */
174 /* */
175 /* readBuffSize - [default: 80] -- This is the size (in bytes) of the */
176 /* chunks that are read when reading in the file. */
177 /* Setting it to higher values may increase the */
178 /* efficiency, but with operating system and hardware */
179 /* buffering going on, it probably doesn't make much */
180 /* difference. */
181 /* */
182 /* debugLevel - [default: ALI_SHOW_NONE] -- Determines what debugging */
183 /* information to display to stderr during processing. */
184 /* Can be set to one of the following: */
185 /* */
186 /* ALI_SHOW_NONE : Show no debugging info [default] */
187 /* ALI_SHOW_SEQUENCES : Show lines classified as seqs */
188 /* ALI_SHOW_DEFLINES : Show lines classified as deflines*/
189 /* ALI_SHOW_OTHERS : Show lines classified as others */
190 /* (ie, not sequences or deflines). */
191 /* ALI_SHOW_ALL : Show all lines and their */
192 /* classification. */
193 /* */
194 /* corruptSeqThreshold - [Default: 95] -- Used to guess that a line is */
195 /* actually a corrupted sequence. If the line */
196 /* contains a percentage of sequence characters */
197 /* equal to or above the corruptSeqThreshold */
198 /* then it is marked as maybe a sequence line. */
199 /* */
200 /* nuclLineMaxThreshold - [Default: 75] -- Used to determine whether a */
201 /* sequence is DNA or protein. If the line has */
202 /* MORE than nuclLineMaxThreshold percent of */
203 /* the characters "ACGT" and the missing and */
204 /* gap chars (and all the other characters are */
205 /* ambiguous protein/DNA characters), then it */
206 /* is marked as a nucleotide sequence. */
207 /* */
208 /* nuclLineMinThreshold - [Default: 25] -- Used to determine whether a */
209 /* sequence is DNA or protein. If the line has */
210 /* LESS than nuclLineMinThreshold percent of */
211 /* the characters "ACGT" and the missing and */
212 /* gap chars (and all the other characters are */
213 /* ambiguous protein/DNA characters), then it */
214 /* is marked as a protein sequence. */
215 /* */
216 /* errExpandLevel - */
217 /* */
218 /* */
219 /* options parameter */
220 /* ----------------- */
221 /* */
222 /* The options parameter determines which fields in the configPtr are */
223 /* being given new values. It contains one or more of the following */
224 /* values OR'd together : */
225 /* */
226 /* ALI_SET_DEFAULTS */
227 /* ALI_SET_ALL */
228 /* */
229 /* ALI_SET_GAP_CHAR */
230 /* ALI_SET_MISSING_CHAR */
231 /* ALI_SET_MAYBES */
232 /* ALI_SET_READBUFF */
233 /* ALI_SET_NUCL_MIN */
234 /* ALI_SET_NUCL_MAX */
235 /* ALI_SET_CORRUPT_MAX */
236 /* ALI_SET_DEBUG_LEVEL */
237 /* ALI_SET_ERRMSG_EXPAND */
238 /* */
239 /* If ALI_SET_DEFAULTS or ALI_SET_ALL are used then any others are */
240 /* ignored. */
241 /* */
242 /*=========================================================================*/
243
244 Boolean Ali_SetConfig (AliConfigInfoPtr configPtr,
245 Int2 mask)
246 {
247
248 /* If this is the first time called, or we're restoring */
249 /* the defaults, then set all options to the defaults. */
250
251 if ((s_configurationSet == FALSE) ||
252 (configPtr == NULL) ||
253 (mask == ALI_SET_DEFAULTS))
254 {
255 s_configInfo.useMaybes = ALI_USE_MAYBES;
256 s_configInfo.readBuffSize = ALI_READ_BUFFSIZE;
257 s_configInfo.debugLevel = ALI_SHOW_NONE;
258 s_configInfo.corruptSeqThreshold = ALI_CORRUPT_SEQ_THRESHOLD;
259 s_configInfo.nuclLineMinThreshold = ALI_NUCL_LINE_MIN_THRESHOLD;
260 s_configInfo.nuclLineMaxThreshold = ALI_NUCL_LINE_MAX_THRESHOLD;
261 s_configInfo.declaredInfo.dataType = ALI_UNKNOWN;
262 s_configInfo.declaredInfo.contigOrInter = ALI_UNKNOWN;
263 s_configInfo.errExpandLevel = ALI_ERRMSG_EXPAND_SOME;
264 s_configInfo.declaredInfo.idCount = 0;
265 s_configInfo.declaredInfo.seqLength = 0;
266 s_configInfo.gapChar = (CharPtr) MemNew (32);
267 sprintf (s_configInfo.gapChar , "%c%c", ALI_GAP_CHAR, '.');
268 s_configInfo.missingChar = (CharPtr) MemNew (32);
269 sprintf (s_configInfo.missingChar, "%c", ALI_MISSING_CHAR);
270 }
271
272 s_configurationSet = TRUE;
273
274 /* If we're setting to the defaults, then we're done */
275
276 if ((configPtr == NULL) || (mask == ALI_SET_DEFAULTS))
277 return TRUE;
278
279 /* Otherwise, override the current settings */
280 /* where instructed. */
281
282 if ((mask & ALI_SET_GAP_CHAR) || (mask == ALI_SET_ALL))
283 StringCpy (s_configInfo.gapChar, configPtr->gapChar);
284
285 if ((mask & ALI_SET_MISSING_CHAR) || (mask == ALI_SET_ALL))
286 StringCpy (s_configInfo.missingChar, configPtr->missingChar);
287
288 if ((mask & ALI_SET_MAYBES) || (mask == ALI_SET_ALL))
289 s_configInfo.useMaybes = configPtr->useMaybes;
290
291 if ((mask & ALI_SET_READBUFF) || (mask == ALI_SET_ALL))
292 s_configInfo.readBuffSize = configPtr->readBuffSize;
293
294 if ((mask & ALI_SET_NUCL_MIN) || (mask == ALI_SET_ALL))
295 s_configInfo.nuclLineMinThreshold = configPtr->nuclLineMinThreshold;
296
297 if ((mask & ALI_SET_NUCL_MAX) || (mask == ALI_SET_ALL))
298 s_configInfo.nuclLineMaxThreshold = configPtr->nuclLineMaxThreshold;
299
300 if ((mask & ALI_SET_CORRUPT_MAX) || (mask == ALI_SET_ALL))
301 s_configInfo.corruptSeqThreshold = configPtr->corruptSeqThreshold;
302
303 if ((mask & ALI_SET_DEBUG_LEVEL) || (mask == ALI_SET_ALL))
304 s_configInfo.debugLevel = configPtr->debugLevel;
305
306 if ((mask & ALI_SET_ERRMSG_EXPAND) || (mask == ALI_SET_ALL))
307 s_configInfo.errExpandLevel = configPtr->errExpandLevel;
308
309 /* Return successfully */
310
311 return TRUE;
312 }
313
314 /*=========================================================================*/
315 /* */
316 /* s_FreeErrorNode () - Free one error structure. */
317 /* */
318 /*=========================================================================*/
319
320 static void s_FreeErrorNode (ErrInfoPtr errorPtr)
321 {
322 if (errorPtr->info != NULL)
323 {
324 MemFree (errorPtr->info);
325 errorPtr->info = NULL;
326 }
327 if (errorPtr->extraInfo != NULL)
328 {
329 MemFree (errorPtr->extraInfo);
330 errorPtr->extraInfo = NULL;
331 }
332 MemFree (errorPtr);
333 }
334
335 /*=========================================================================*/
336 /* */
337 /* s_FreeErrorList () - Free a linked list of error structures and all */
338 /* the memory that they point to. */
339 /* */
340 /*=========================================================================*/
341
342 static void s_FreeErrorList (ErrInfoPtr errorPtr)
343 {
344 ErrInfoPtr currentErr;
345
346 while (errorPtr != NULL)
347 {
348 currentErr = errorPtr;
349 errorPtr = errorPtr->next;
350 s_FreeErrorNode (currentErr);
351 }
352 }
353
354 /*=========================================================================*/
355 /* */
356 /* s_FreeSequenceList () - Free a linked list of SeqPart structures and */
357 /* all the memory that they point to. */
358 /* */
359 /*=========================================================================*/
360
361 static void s_FreeSequenceList (SeqPartPtr seqPtr)
362 {
363 SeqPartPtr currentSeq;
364
365 while (seqPtr != NULL)
366 {
367 MemFree (seqPtr->sequence);
368 currentSeq = seqPtr;
369 seqPtr = seqPtr->next;
370 MemFree (currentSeq);
371 }
372 }
373
374 /*=========================================================================*/
375 /* */
376 /* s_FreeIdList () - Free a linked list of ID structures and all the */
377 /* memory that they point to. */
378 /* */
379 /*=========================================================================*/
380
381 static void s_FreeIdList (IdInfoPtr idPtr)
382 {
383 IdInfoPtr currentId;
384
385 while (idPtr != NULL)
386 {
387 MemFree (idPtr->id);
388 s_FreeSequenceList (idPtr->sequence);
389 MemFree (idPtr->defline);
390 currentId = idPtr;
391 idPtr = idPtr->next;
392 MemFree (currentId);
393 }
394 }
395
396 /*=========================================================================*/
397 /* */
398 /* s_FreeParsedInfo () - Free a ParsedInfo structure and the memory that */
399 /* it points to. */
400 /* */
401 /*=========================================================================*/
402
403 static void s_FreeParsedInfo (ParsedInfoPtr info)
404 {
405 if (info->missingChar != NULL)
406 MemFree (info->missingChar);
407 if (info->gapChar != NULL)
408 MemFree (info->gapChar);
409 if (info->unalignedChar != NULL)
410 MemFree (info->unalignedChar);
411 MemFree (info);
412 }
413
414 /*=========================================================================*/
415 /* */
416 /* Ali_Free () - Free a AlignFileData structure and all the memory that */
417 /* it points to. */
418 /* */
419 /*=========================================================================*/
420
421 void Ali_Free (AlignFileDataPtr fileInfoPtr)
422 {
423
424 s_FreeIdList (fileInfoPtr->sequences);
425 fileInfoPtr->sequences = NULL;
426 s_FreeIdList (fileInfoPtr->maybes);
427 fileInfoPtr->maybes = NULL;
428 s_FreeErrorList (fileInfoPtr->errors);
429 fileInfoPtr->errors = NULL;
430 s_FreeParsedInfo (fileInfoPtr->info);
431 fileInfoPtr->info = NULL;
432
433 MemFree (fileInfoPtr);
434
435 return;
436 }
437
438 /*=========================================================================*/
439 /* */
440 /* s_FreeRowList () - Free all row data structures and the strings that */
441 /* they point to. */
442 /* */
443 /* NOTE: The actual data strings in the row list may be pointed */
444 /* to by other structures, in which case */
445 /* s_FreeRowList_Safe () should be used instead. */
446 /* */
447 /*=========================================================================*/
448
449 static void s_FreeRowList (ValNodePtr rowList)
450 {
451 ValNodePtr currentRow;
452 SeqLineInfoPtr seqLine;
453 DefLineInfoPtr defLine;
454 OtherLineInfoPtr otherLine;
455
456 while (rowList != NULL)
457 {
458 switch (rowList->choice)
459 {
460 case ALI_DEFLINE :
461 defLine = (DefLineInfoPtr) rowList->data.ptrvalue;
462 if (defLine->definitions != NULL)
463 MemFree (defLine->definitions);
464 if (defLine->id != NULL)
465 MemFree (defLine->id);
466 MemFree (defLine);
467 break;
468 case ALI_SEQLINE :
469 seqLine = (SeqLineInfoPtr) rowList->data.ptrvalue;
470 if (seqLine->sequence != NULL)
471 MemFree (seqLine->sequence);
472 if (seqLine->id != NULL)
473 MemFree (seqLine->id);
474 if (seqLine->junk != NULL)
475 MemFree (seqLine->junk);
476 MemFree (seqLine);
477 break;
478 case ALI_OTHERLINE :
479 otherLine = (OtherLineInfoPtr) rowList->data.ptrvalue;
480 if (otherLine->other != NULL)
481 MemFree (otherLine->other);
482 if (otherLine->id != NULL)
483 MemFree (otherLine->id);
484 MemFree (otherLine);
485 break;
486 default:
487 break;
488 }
489 currentRow = rowList;
490 rowList = rowList->next;
491 MemFree (currentRow);
492 }
493 }
494
495 /*=========================================================================*/
496 /* */
497 /* s_FreeRowList_Safe () - Free all row data structures, but don't free */
498 /* the strings that they point since they are */
499 /* still being used in the ID structures. */
500 /* */
501 /*=========================================================================*/
502
503 static void s_FreeRowList_Safe (ValNodePtr rowList)
504 {
505 ValNodePtr currentRow;
506 SeqLineInfoPtr seqLine;
507 DefLineInfoPtr defLine;
508 OtherLineInfoPtr otherLine;
509
510 while (rowList != NULL)
511 {
512 switch (rowList->choice)
513 {
514 case ALI_DEFLINE :
515 defLine = (DefLineInfoPtr) rowList->data.ptrvalue;
516 MemFree (defLine);
517 break;
518 case ALI_SEQLINE :
519 seqLine = (SeqLineInfoPtr) rowList->data.ptrvalue;
520 MemFree (seqLine);
521 break;
522 case ALI_OTHERLINE :
523 otherLine = (OtherLineInfoPtr) rowList->data.ptrvalue;
524 MemFree (otherLine);
525 break;
526 default:
527 break;
528 }
529 currentRow = rowList;
530 rowList = rowList->next;
531 MemFree (currentRow);
532 }
533 }
534
535 /*=========================================================================*/
536 /* */
537 /* s_GetRowIdStr () */
538 /* */
539 /*=========================================================================*/
540
541 static CharPtr s_GetRowIdString (ValNodePtr row)
542 {
543 CharPtr newIdStr;
544 SeqLineInfoPtr seqLinePtr;
545 DefLineInfoPtr defLinePtr;
546 OtherLineInfoPtr otherLinePtr;
547
548 if (row == NULL)
549 return NULL;
550
551 if (row->choice == ALI_SEQLINE)
552 {
553 seqLinePtr = (SeqLineInfoPtr) row->data.ptrvalue;
554 if (seqLinePtr->id != NULL)
555 {
556 if ((seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == FALSE))
557 newIdStr = NULL;
558 else
559 newIdStr = seqLinePtr->id;
560 }
561 else
562 newIdStr = NULL;
563 }
564 else if (row->choice == ALI_DEFLINE)
565 {
566 defLinePtr = (DefLineInfoPtr) row->data.ptrvalue;
567 if (defLinePtr->id != NULL)
568 newIdStr = defLinePtr->id;
569 else
570 newIdStr = NULL;
571 }
572 else if (row->choice == ALI_OTHERLINE)
573 {
574 otherLinePtr = (OtherLineInfoPtr) row->data.ptrvalue;
575 if (otherLinePtr->id != NULL)
576 newIdStr = otherLinePtr->id;
577 else
578 newIdStr = NULL;
579 }
580
581 return newIdStr;
582 }
583
584 /*=========================================================================*/
585 /* */
586 /* s_GetRowSeqStr () */
587 /* */
588 /*=========================================================================*/
589
590 static CharPtr s_GetRowSeqString (ValNodePtr row)
591 {
592 CharPtr newSeqStr;
593 SeqLineInfoPtr seqLinePtr;
594
595 if (row == NULL)
596 return NULL;
597
598 if (row->choice == ALI_SEQLINE)
599 {
600 seqLinePtr = (SeqLineInfoPtr) row->data.ptrvalue;
601 if (seqLinePtr->sequence != NULL)
602 {
603 if ((seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == FALSE))
604 newSeqStr = NULL;
605 else
606 newSeqStr = seqLinePtr->sequence;
607 }
608 else
609 newSeqStr = NULL;
610 }
611 else
612 newSeqStr = NULL;
613
614 return newSeqStr;
615 }
616
617 /*=========================================================================*/
618 /* */
619 /* s_ProcessMaybes () */
620 /* */
621 /*=========================================================================*/
622
623 static IdInfoPtr s_ProcessMaybes (ValNodePtr rowList)
624 {
625 ValNodePtr currentRow;
626 IdInfoPtr badIdList = NULL;
627 IdInfoPtr existingId = NULL;
628 IdInfoPtr currentId = NULL;
629 IdInfoPtr lastId = NULL;
630 CharPtr idStr;
631 CharPtr currentIdStr;
632 SeqPartPtr newSeqPart;
633 SeqPartPtr lastSeqPart;
634 SeqLineInfoPtr seqLinePtr;
635
636 currentRow = rowList;
637
638 while (currentRow != NULL)
639 {
640 idStr = s_GetRowIdString (currentRow);
641 if (idStr != NULL)
642 currentIdStr = idStr;
643
644 if (currentRow->choice == ALI_SEQLINE)
645 {
646 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
647 if (seqLinePtr->maybe == TRUE)
648 {
649
650 /* Find the ID that this sequence 'belongs to' */
651
652 existingId = badIdList;
653 while (existingId != NULL)
654 {
655 if (StringCmp(existingId->id,currentIdStr) == 0)
656 break;
657 existingId = existingId->next;
658 }
659
660 if (existingId != NULL)
661 currentId = existingId;
662 else
663 {
664 currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
665 if (currentId == NULL)
666 return NULL;
667
668 currentId->sequence = NULL;
669 currentId->id = currentIdStr;
670 currentId->length = 0;
671 currentId->next = NULL;
672
673 if (badIdList == NULL)
674 badIdList = currentId;
675 else
676 {
677 lastId = badIdList;
678 while (lastId->next != NULL)
679 lastId = lastId->next;
680 lastId->next = currentId;
681 }
682 }
683
684 /* Add the sequence to the current ID */
685
686 newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart));
687 if (newSeqPart == NULL)
688 return NULL;
689
690 newSeqPart->sequence = (CharPtr) currentRow->data.ptrvalue;
691 newSeqPart->next = NULL;
692
693 if (currentId->sequence == NULL)
694 currentId->sequence = newSeqPart;
695 else
696 lastSeqPart->next = newSeqPart;
697
698 currentId->length += StringLen (newSeqPart->sequence);
699 lastSeqPart = newSeqPart;
700
701 }
702 }
703 currentRow = currentRow->next;
704 }
705
706 return badIdList;
707 }
708
709 /*=========================================================================*/
710 /* */
711 /* DisplayRowList() - Prints to stderr the linked list of ValNodes that */
712 /* contain the data read in from the alignment file. */
713 /* */
714 /*=========================================================================*/
715
716 static void s_DisplayRowList (ValNodePtr rowList,
717 Int2 mask)
718 {
719 ValNodePtr currRow;
720 SeqLineInfoPtr seqLinePtr;
721 DefLineInfoPtr defLinePtr;
722 OtherLineInfoPtr otherLinePtr;
723 Char cLineType;
724
725 currRow = rowList;
726 while (currRow != NULL)
727 {
728 if ((currRow->choice == ALI_SEQLINE) &&
729 ((mask & ALI_SHOW_SEQUENCES) ||
730 (mask == ALI_SHOW_ALL)))
731 {
732 seqLinePtr = (SeqLineInfoPtr) currRow->data.ptrvalue;
733
734 if (seqLinePtr->type == ALI_NUCLEOTIDE)
735 cLineType = 'N';
736 else if (seqLinePtr->type == ALI_PROTEIN)
737 cLineType = 'P';
738 else if (seqLinePtr->type == ALI_AMBIGUOUS)
739 cLineType = 'U';
740
741 if (seqLinePtr->maybe == FALSE)
742 {
743 if (seqLinePtr->id != NULL)
744 fprintf(stderr,"%04d: ID : %s\n",
745 seqLinePtr->rowNum,
746 seqLinePtr->id);
747 if (seqLinePtr->sequence != NULL)
748 fprintf(stderr,"%04d: SEQUENCE[%c] : %s\n",
749 seqLinePtr->rowNum,
750 cLineType,
751 seqLinePtr->sequence);
752 }
753 else
754 {
755 if (seqLinePtr->id != NULL)
756 fprintf(stderr,"%04d: MAYBE ID : %s\n",
757 seqLinePtr->rowNum,
758 seqLinePtr->id);
759 if (seqLinePtr->sequence != NULL)
760 fprintf(stderr,"%04d: MAYBE SEQUENCE[%c] : %s\n",
761 seqLinePtr->rowNum,
762 cLineType,
763 seqLinePtr->sequence);
764 }
765 }
766 else if ((currRow->choice == ALI_DEFLINE) &&
767 ((mask & ALI_SHOW_DEFLINES) ||
768 (mask == ALI_SHOW_ALL)))
769 {
770 defLinePtr = (DefLineInfoPtr) currRow->data.ptrvalue;
771 if (defLinePtr->id != NULL)
772 fprintf(stderr,"%04d: DEFLINE ID : %s\n",
773 defLinePtr->rowNum,
774 defLinePtr->id);
775 if (defLinePtr->definitions != NULL)
776 fprintf(stderr,"%04d: DEFLINE DEFINITIONS : %s\n",
777 defLinePtr->rowNum,
778 defLinePtr->definitions);
779 }
780 else if ((currRow->choice == ALI_OTHERLINE) &&
781 ((mask & ALI_SHOW_OTHERS) ||
782 (mask == ALI_SHOW_ALL)))
783 {
784 otherLinePtr = (OtherLineInfoPtr) currRow->data.ptrvalue;
785 if (otherLinePtr->id != NULL)
786 fprintf(stderr,"%04d: OTHER ID : %s\n", otherLinePtr->rowNum,
787 otherLinePtr->id);
788 if (otherLinePtr->other != NULL)
789 fprintf(stderr,"%04d: OTHER : %s\n", otherLinePtr->rowNum,
790 otherLinePtr->other);
791 }
792 currRow = currRow->next;
793 }
794
795 return;
796 }
797
798
799 /*=========================================================================*/
800 /* */
801 /* s_isInterleaved () */
802 /* */
803 /*=========================================================================*/
804
805 static Boolean s_IsInterleaved (ValNodePtr rowList,
806 Int2 PNTR idCount)
807 {
808 ValNodePtr currentRow;
809 CharPtr newIdStr;
810 IdInfoPtr idList = NULL;
811 IdInfoPtr lastId = NULL;
812 IdInfoPtr currentId = NULL;
813 IdInfoPtr existingId = NULL;
814 Boolean isInterleaved;
815 Int4 patternRowCount;
816 Int4 patternCharCount;
817 Int4 currentRowCount;
818 Int4 currentCharCount;
819 Boolean isFirstId;
820 SeqLineInfoPtr seqLinePtr;
821 DefLineInfoPtr defLinePtr;
822 OtherLineInfoPtr otherLinePtr;
823 Boolean isMaybe;
824
825 isInterleaved = FALSE;
826 currentRow = rowList;
827
828 patternRowCount = 0;
829 patternCharCount = 0;
830 currentRowCount = 0;
831 currentCharCount = 0;
832 isFirstId = TRUE;
833 *idCount = 0;
834
835 /* Search the row list for IDs */
836
837 while (currentRow != NULL)
838 {
839
840 /* Look for an ID */
841
842 newIdStr = NULL;
843 isMaybe = FALSE;
844
845 if (currentRow->choice == ALI_SEQLINE)
846 {
847 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
848 if (seqLinePtr->id != NULL)
849 {
850 if ((seqLinePtr->maybe == TRUE) &&
851 (s_configInfo.useMaybes == FALSE))
852 newIdStr = NULL;
853 else
854 newIdStr = seqLinePtr->id;
855 }
856 }
857 else if (currentRow->choice == ALI_DEFLINE)
858 {
859 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
860 if (defLinePtr->id != NULL)
861 newIdStr = defLinePtr->id;
862 }
863 else if (currentRow->choice == ALI_OTHERLINE)
864 {
865 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
866 if (otherLinePtr->id != NULL)
867 newIdStr = otherLinePtr->id;
868 }
869
870 /* If we find an ID, see if it's one */
871 /* that we already have. */
872
873 if (newIdStr != NULL)
874 {
875
876 existingId = idList;
877 while (existingId != NULL)
878 {
879 if (StringCmp(existingId->id,newIdStr) == 0)
880 break;
881 existingId = existingId->next;
882 }
883
884 /* Already have -- break and return TRUE */
885
886 if (existingId != NULL)
887 {
888 isInterleaved = TRUE;
889 break;
890 }
891
892 /* Otherwise, add the ID to the list */
893
894 currentRowCount = 0;
895 currentCharCount = 0;
896
897 if (idList != NULL)
898 isFirstId = FALSE;
899
900 (*idCount)++;
901
902 currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
903 if (currentId == NULL)
904 return FALSE;
905
906 currentId->sequence = NULL;
907 currentId->id = newIdStr;
908 currentId->length = 0;
909 currentId->next = NULL;
910
911 if (idList == NULL)
912 idList = currentId;
913 else
914 {
915 lastId = idList;
916 while (lastId->next != NULL)
917 lastId = lastId->next;
918 lastId->next = currentId;
919 }
920 }
921
922 /* Process sequence rows */
923
924 if (currentRow->choice == ALI_SEQLINE)
925 {
926
927 if (seqLinePtr->sequence != NULL)
928 if ((s_configInfo.useMaybes == TRUE) ||
929 (s_configInfo.useMaybes == FALSE) &&
930 (seqLinePtr->maybe == FALSE))
931 {
932 /* There must be an ID before the first sequence */
933
934 if (currentId == NULL)
935 {
936 isInterleaved = FALSE;
937 break;
938 }
939
940 /* Look for sequences that probably */
941 /* have no ID assigned to them. */
942
943 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
944 if (isFirstId)
945 {
946 patternRowCount++;
947 patternCharCount += StringLen (seqLinePtr->sequence);
948 }
949 else
950 {
951 currentRowCount++;
952 currentCharCount += StringLen (seqLinePtr->sequence);
953 if ((currentRowCount > patternRowCount) &&
954 (currentCharCount > patternCharCount))
955 {
956 isInterleaved = TRUE;
957 break;
958 }
959 }
960 }
961
962 }
963
964 /* Go to next row */
965
966 currentRow = currentRow->next;
967 }
968
969 /* Delete the ID records that we created */
970 /* NOTE -- The ID strings themselves */
971 /* are stored elsewhere and */
972 /* only pointed to here, so */
973 /* DON"T delete them. o */
974
975 while (idList != NULL)
976 {
977 lastId = idList;
978 idList = idList->next;
979 MemFree(lastId);
980 }
981
982 /* Return result of search */
983
984 return isInterleaved;
985 }
986
987 /*=========================================================================*/
988 /* */
989 /* s_ProcessInterId () */
990 /* */
991 /*=========================================================================*/
992
993 static Boolean s_ProcessInterId (CharPtr newIdStr,
994 PatternInfoPtr pattern,
995 AlignFileDataPtr fileInfoPtr,
996 Boolean isMaybe)
997 {
998 IdInfoPtr lastId = NULL;
999 IdInfoPtr existingId = NULL;
1000 ErrInfoPtr errPtr;
1001
1002 /* If we've got all our ID's then */
1003 /* ignore any further ones. */
1004
1005 if (pattern->gotAllIds == TRUE)
1006 return TRUE;
1007
1008 /* All ID's, except for the first one, should */
1009 /* immediately follow a sequence line. */
1010
1011 if (pattern->isFirstId == FALSE)
1012 {
1013 if (pattern->lastRow->choice != ALI_SEQLINE)
1014 {
1015 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1016 ERR_ID_NO_PRECEDING_SEQ,
1017 newIdStr);
1018 return FALSE;
1019 }
1020 else
1021 pattern->isFirstGroup = FALSE;
1022 }
1023
1024 /* If this id already exists, */
1025 /* make it the current ID. */
1026
1027 existingId = fileInfoPtr->sequences;
1028 while (existingId != NULL)
1029 {
1030 if (StringCmp(existingId->id,newIdStr) == 0)
1031 break;
1032 existingId = existingId->next;
1033 }
1034
1035 if (existingId != NULL)
1036 pattern->currentId = existingId;
1037
1038 /* Otherwise create a new Id record */
1039 /* and add it to the end of list. */
1040
1041 else
1042 {
1043 pattern->currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
1044 if (pattern->currentId == NULL)
1045 {
1046 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1047 return FALSE;
1048 }
1049
1050 pattern->currentId->sequence = NULL;
1051 pattern->currentId->id = newIdStr;
1052 pattern->currentId->length = 0;
1053 pattern->currentId->next = NULL;
1054
1055 if (fileInfoPtr->sequences == NULL)
1056 fileInfoPtr->sequences = pattern->currentId;
1057 else
1058 {
1059 lastId = fileInfoPtr->sequences;
1060 while (lastId->next != NULL)
1061 lastId = lastId->next;
1062 lastId->next = pattern->currentId;
1063 }
1064
1065 pattern->currentIdCount++;
1066 if (pattern->currentIdCount == pattern->idCount)
1067 pattern->gotAllIds = TRUE;
1068 }
1069
1070 if (pattern->isFirstId)
1071 pattern->isFirstId = FALSE;
1072
1073 /* Return successfully */
1074
1075 return TRUE;
1076 }
1077
1078 /*=========================================================================*/
1079 /* */
1080 /* s_ProcessInterSeq () */
1081 /* */
1082 /*=========================================================================*/
1083
1084 static Boolean s_ProcessInterSeq (CharPtr newSeqStr,
1085 PatternInfoPtr pattern,
1086 AlignFileDataPtr fileInfoPtr,
1087 Boolean isMaybe)
1088 {
1089 SeqPartPtr newSeqPart = NULL;
1090 ErrInfoPtr errPtr = NULL;
1091
1092 /* There must be an ID before the first sequence */
1093
1094 if (pattern->currentId == NULL)
1095 {
1096 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_SEQ_WITHOUT_ID,
1097 newSeqStr);
1098 return FALSE;
1099 }
1100
1101 /* Add the sequence to the current ID */
1102
1103 newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart));
1104 if (newSeqPart == NULL)
1105 {
1106 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1107 return FALSE;
1108 }
1109
1110 newSeqPart->sequence = newSeqStr;
1111 newSeqPart->next = NULL;
1112
1113 if (pattern->currentId->sequence == NULL)
1114 pattern->currentId->sequence = newSeqPart;
1115 else
1116 pattern->lastSeqPart->next = newSeqPart;
1117
1118 pattern->currentId->length += StringLen (newSeqPart->sequence);
1119 pattern->lastSeqPart = newSeqPart;
1120
1121 /* If we've started repeating IDs then */
1122 /* rotate through the id list. */
1123
1124 if (pattern->gotAllIds == TRUE)
1125 {
1126 if (pattern->currentId->next == NULL)
1127 pattern->currentId = fileInfoPtr->sequences;
1128 else
1129 pattern->currentId = pattern->currentId->next;
1130
1131 pattern->lastSeqPart = pattern->currentId->sequence;
1132 while (pattern->lastSeqPart->next != NULL)
1133 pattern->lastSeqPart = pattern->lastSeqPart->next;
1134 }
1135
1136 /* Return successfully */
1137
1138 return TRUE;
1139 }
1140
1141 /*=========================================================================*/
1142 /* */
1143 /* s_AnalyzeInterleaved () */
1144 /* */
1145 /*=========================================================================*/
1146
1147 static Boolean s_AnalyzeInterleaved (ValNodePtr rowList,
1148 AlignFileDataPtr fileInfoPtr,
1149 Int2 idCount)
1150 {
1151 ValNodePtr currentRow;
1152 Boolean isValidPattern;
1153 IdInfoPtr currentId = NULL;
1154 Int4 previousLength;
1155 ErrInfoPtr errPtr;
1156 PatternInfoPtr pattern;
1157 SeqLineInfoPtr seqLinePtr;
1158 DefLineInfoPtr defLinePtr;
1159 OtherLineInfoPtr otherLinePtr;
1160 Boolean firstDefline = TRUE;
1161 IdInfoPtr lastId = NULL;
1162
1163 pattern = (PatternInfoPtr) MemNew (sizeof (PatternInfo));
1164
1165 pattern->currentDeflineId = NULL;
1166 pattern->lastRow = NULL;
1167 pattern->isFirstId = TRUE;
1168 pattern->isFirstGroup = TRUE;
1169 pattern->maybesFound = FALSE;
1170 pattern->gotAllIds = FALSE;
1171 pattern->idCount = idCount;
1172 pattern->currentIdCount = 0;
1173
1174 pattern->foundInfo.dataType = ALI_UNKNOWN;
1175 pattern->foundInfo.contigOrInter = ALI_UNKNOWN;
1176 pattern->foundInfo.idCount = 0;
1177 pattern->foundInfo.seqLength = 0;
1178
1179 /* Match the sequences up with the IDs */
1180
1181 currentRow = rowList;
1182 isValidPattern = TRUE;
1183
1184 while (currentRow != NULL)
1185 {
1186
1187 if (currentRow->choice == ALI_SEQLINE)
1188 {
1189
1190 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
1191
1192 if ((seqLinePtr->maybe == FALSE) ||
1193 (seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == TRUE))
1194 {
1195 if (seqLinePtr->id != NULL)
1196 {
1197 /* Process the ID */
1198
1199 isValidPattern = s_ProcessInterId (seqLinePtr->id,
1200 pattern,
1201 fileInfoPtr,
1202 seqLinePtr->maybe);
1203 if (isValidPattern == FALSE)
1204 break;
1205 }
1206
1207 if (seqLinePtr->sequence != NULL)
1208 {
1209 isValidPattern = s_ProcessInterSeq (seqLinePtr->sequence,
1210 pattern,
1211 fileInfoPtr,
1212 seqLinePtr->maybe);
1213 if (isValidPattern == FALSE)
1214 break;
1215 }
1216 pattern->lastRow = currentRow;
1217 }
1218 else
1219 pattern->maybesFound = TRUE;
1220 }
1221 else if (currentRow->choice == ALI_DEFLINE)
1222 {
1223 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
1224 if (defLinePtr->id != NULL)
1225 {
1226 isValidPattern = s_ProcessInterId (defLinePtr->id,
1227 pattern,
1228 fileInfoPtr,
1229 FALSE);
1230 if (isValidPattern == FALSE)
1231 break;
1232 }
1233 if (defLinePtr->definitions != NULL)
1234 {
1235 if (firstDefline)
1236 {
1237 firstDefline = FALSE;
1238 pattern->currentDeflineId = fileInfoPtr->sequences;
1239 }
1240 else
1241 pattern->currentDeflineId =
1242 pattern->currentDeflineId->next;
1243
1244 if (pattern->currentDeflineId == NULL)
1245 {
1246 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1247 ERR_DEFLINE_WITH_NO_ID,
1248 defLinePtr->definitions);
1249 errPtr->rowNum = defLinePtr->rowNum;
1250 isValidPattern = FALSE;
1251 break;
1252 }
1253 else
1254 {
1255 pattern->currentDeflineId->defline =
1256 defLinePtr->definitions;
1257 }
1258 }
1259 pattern->lastRow = currentRow;
1260 }
1261 else if (currentRow->choice == ALI_OTHERLINE)
1262 {
1263 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
1264 if (otherLinePtr->id != NULL)
1265 {
1266 isValidPattern = s_ProcessInterId (otherLinePtr->id,
1267 pattern,
1268 fileInfoPtr,
1269 FALSE);
1270 if (isValidPattern == FALSE)
1271 break;
1272 }
1273 pattern->lastRow = currentRow;
1274 }
1275
1276 currentRow = currentRow->next;
1277 }
1278
1279 /* If we found one defline, then */
1280 /* make sure they were all there */
1281
1282 if (firstDefline == FALSE)
1283 {
1284 lastId = fileInfoPtr->sequences;
1285 if (lastId != NULL)
1286 {
1287 while (lastId->next != NULL)
1288 lastId = lastId->next;
1289 if (lastId->defline == NULL)
1290 {
1291 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1292 ERR_ID_WITH_NO_DEFLINE,
1293 lastId->id);
1294 isValidPattern = FALSE;
1295 }
1296 }
1297 }
1298
1299 /* If pattern not found, return failure */
1300
1301 if (!isValidPattern)
1302 return FALSE;
1303
1304 /* If there was a declared number of sequences then */
1305 /* check to see that it matches the number found. */
1306
1307 if ((s_configInfo.declaredInfo.idCount !=0) &&
1308 (s_configInfo.declaredInfo.idCount != idCount))
1309 {
1310 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_ID_COUNT_MISMATCH,
1311 idCount, s_configInfo.declaredInfo.idCount);
1312 errPtr->level = LEVEL_WARNING;
1313 }
1314
1315 /* Sequences should all be the same length. */
1316
1317 currentId = fileInfoPtr->sequences;
1318 pattern->isFirstId = TRUE;
1319
1320 while (currentId != NULL)
1321 {
1322 if (pattern->isFirstId)
1323 pattern->isFirstId = FALSE;
1324 else
1325 {
1326 if (previousLength < currentId->length)
1327 {
1328 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1329 ERR_SEQUENCE_TOO_LONG,
1330 currentId->id,
1331 previousLength,
1332 currentId->length);
1333 break;
1334 }
1335 else if (previousLength > currentId->length)
1336 {
1337 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1338 ERR_SEQUENCE_TOO_SHORT,
1339 currentId->id,
1340 previousLength,
1341 currentId->length);
1342 break;
1343 }
1344 }
1345 previousLength = currentId->length;
1346 currentId = currentId->next;
1347 }
1348
1349 /* Check to see that declared sequence */
1350 /* length matches the lengths found. */
1351
1352 if ((s_configInfo.declaredInfo.seqLength != 0) &&
1353 (s_configInfo.declaredInfo.seqLength != previousLength))
1354 {
1355 errPtr = Ali_AddError (&(fileInfoPtr->errors),ERR_SEQ_LENGTH_MISMATCH,
1356 previousLength,
1357 s_configInfo.declaredInfo.seqLength);
1358 errPtr->level = LEVEL_WARNING;
1359 }
1360
1361 /* Process the maybes if they weren't used already */
1362
1363 if (pattern->maybesFound == TRUE)
1364 fileInfoPtr->maybes = s_ProcessMaybes (rowList);
1365
1366 /* Return successfully */
1367
1368 if (currentId == NULL)
1369 return TRUE;
1370 else
1371 return FALSE;
1372 }
1373
1374 /*=========================================================================*/
1375 /* */
1376 /* s_ProcessContigId () */
1377 /* */
1378 /*=========================================================================*/
1379
1380 static Boolean s_ProcessContigId (CharPtr newIdStr,
1381 PatternInfoPtr pattern,
1382 AlignFileDataPtr fileInfoPtr)
1383 {
1384 IdInfoPtr existingId = NULL;
1385 ErrInfoPtr errPtr;
1386 IdInfoPtr lastId = NULL;
1387
1388 if (pattern->isFirstId == FALSE)
1389 {
1390 pattern->isFirstGroup = FALSE;
1391
1392 /* The length of the last pattern must match */
1393 /* the length of previous ones. */
1394
1395 if (pattern->currentId->length < pattern->foundInfo.seqLength)
1396 {
1397 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1398 ERR_SEQUENCE_TOO_SHORT,
1399 pattern->currentId->id,
1400 pattern->foundInfo.seqLength,
1401 pattern->currentId->length);
1402 return FALSE;
1403 }
1404 }
1405
1406 pattern->hasFullLength = FALSE;
1407
1408 /* See if this ID already exists */
1409
1410 existingId = fileInfoPtr->sequences;
1411 while (existingId != NULL)
1412 {
1413 if (StringCmp(existingId->id,newIdStr) == 0)
1414 {
1415 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_DUPLICATE_IDS,
1416 newIdStr);
1417 return FALSE;
1418 }
1419 existingId = existingId->next;
1420 }
1421
1422 /* If this id already exists, */
1423 /* make it the current ID. */
1424
1425 if (existingId != NULL)
1426 pattern->currentId = existingId;
1427
1428 /* Otherwise create a new Id record */
1429 /* and add it to the end of list. */
1430
1431 else
1432 {
1433 pattern->currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
1434 if (pattern->currentId == NULL)
1435 {
1436 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1437 return FALSE;
1438 }
1439
1440 pattern->currentId->sequence = NULL;
1441 pattern->currentId->id = newIdStr;
1442 pattern->currentId->length = 0;
1443 pattern->currentId->next = NULL;
1444
1445 if (fileInfoPtr->sequences == NULL)
1446 fileInfoPtr->sequences = pattern->currentId;
1447 else
1448 {
1449 lastId = fileInfoPtr->sequences;
1450 while (lastId->next != NULL)
1451 lastId = lastId->next;
1452 lastId->next = pattern->currentId;
1453 }
1454 pattern->foundInfo.idCount++;
1455 }
1456
1457 if (pattern->isFirstId)
1458 pattern->isFirstId = FALSE;
1459
1460 /* Return successfully */
1461
1462 return TRUE;
1463 }
1464
1465 /*=========================================================================*/
1466 /* */
1467 /* s_ProcessContigSeq () */
1468 /* */
1469 /*=========================================================================*/
1470
1471 static Boolean s_ProcessContigSeq (CharPtr newSeqStr,
1472 PatternInfoPtr pattern,
1473 AlignFileDataPtr fileInfoPtr)
1474 {
1475 SeqPartPtr newSeqPart = NULL;
1476 ErrInfoPtr errPtr;
1477
1478 /* There must be an ID before we get a sequence */
1479
1480 if (pattern->currentId == NULL)
1481 {
1482 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_SEQ_WITHOUT_ID,
1483 newSeqStr);
1484 return FALSE;
1485 }
1486
1487 /* Add the sequence to the current ID */
1488
1489 newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart));
1490 if (newSeqPart == NULL)
1491 {
1492 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1493 return FALSE;
1494 }
1495
1496 newSeqPart->sequence = newSeqStr;
1497 newSeqPart->next = NULL;
1498
1499 if (pattern->currentId->sequence == NULL)
1500 pattern->currentId->sequence = newSeqPart;
1501 else
1502 pattern->lastSeqPart->next = newSeqPart;
1503
1504 /* Make sure that sequence length hasn't */
1505 /* exceeded that of previous sequences. */
1506
1507 pattern->currentId->length += StringLen (newSeqPart->sequence);
1508 pattern->lastSeqPart = newSeqPart;
1509
1510 if (pattern->isFirstGroup)
1511 {
1512 pattern->foundInfo.seqLength += StringLen (newSeqPart->sequence);
1513 }
1514 else
1515 {
1516
1517 if (pattern->currentId->length == pattern->foundInfo.seqLength)
1518 pattern->hasFullLength = TRUE;
1519 else if (pattern->currentId->length > pattern->foundInfo.seqLength)
1520 {
1521 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1522 ERR_SEQUENCE_TOO_LONG,
1523 pattern->currentId->id,
1524 pattern->foundInfo.seqLength,
1525 pattern->currentId->length);
1526 return FALSE;
1527 }
1528 }
1529
1530 /* Return successfully */
1531
1532 return TRUE;
1533 }
1534
1535 /*=========================================================================*/
1536 /* */
1537 /* s_AnalyzeContiguous () */
1538 /* */
1539 /*=========================================================================*/
1540
1541 static Boolean s_AnalyzeContiguous (ValNodePtr rowList,
1542 AlignFileDataPtr fileInfoPtr)
1543 {
1544 ValNodePtr currentRow;
1545 SeqLineInfoPtr seqLinePtr;
1546 DefLineInfoPtr defLinePtr;
1547 OtherLineInfoPtr otherLinePtr;
1548 Boolean isValidPattern;
1549 IdInfoPtr lastId = NULL;
1550 IdInfoPtr nextToLastId = NULL;
1551 ErrInfoPtr errPtr;
1552 PatternInfoPtr pattern;
1553 Boolean firstDefline = TRUE;
1554
1555 /* Initialize the pattern info */
1556
1557 pattern = (PatternInfoPtr) MemNew (sizeof (PatternInfo));
1558
1559 pattern->currentDeflineId = NULL;
1560 pattern->currentId = NULL;
1561 pattern->lastSeqPart = NULL;
1562 pattern->hasFullLength = FALSE;
1563 pattern->isFirstId = TRUE;
1564 pattern->isFirstGroup = TRUE;
1565 pattern->maybesFound = FALSE;
1566
1567 pattern->foundInfo.dataType = ALI_UNKNOWN;
1568 pattern->foundInfo.contigOrInter = ALI_UNKNOWN;
1569 pattern->foundInfo.idCount = 0;
1570 pattern->foundInfo.seqLength = 0;
1571
1572 /* Match the sequences up with the IDS */
1573
1574 currentRow = rowList;
1575 isValidPattern = TRUE;
1576
1577 while (currentRow != NULL)
1578 {
1579
1580 /* Process sequence lines */
1581
1582 if (currentRow->choice == ALI_SEQLINE)
1583 {
1584
1585 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
1586
1587 /* If we already have a sequence equal in */
1588 /* in length to those that came before, */
1589 /* then this line may actually be an */
1590 /* ID. */
1591
1592
1593 if ((pattern->hasFullLength == TRUE) &&
1594 (seqLinePtr->id == NULL))
1595 {
1596 Ali_ChangeRowToOther (currentRow);
1597 continue;
1598 }
1599
1600 /* Process the line as a sequence */
1601
1602 if ((seqLinePtr->maybe == FALSE) ||
1603 (seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == TRUE))
1604 {
1605 if (seqLinePtr->id != NULL)
1606 {
1607 isValidPattern = s_ProcessContigId (seqLinePtr->id,
1608 pattern,
1609 fileInfoPtr);
1610 if (isValidPattern == FALSE)
1611 break;
1612 }
1613
1614 if (seqLinePtr->sequence != NULL)
1615 {
1616 isValidPattern = s_ProcessContigSeq (seqLinePtr->sequence,
1617 pattern,
1618 fileInfoPtr);
1619 if (isValidPattern == FALSE)
1620 break;
1621 }
1622 pattern->lastRow = currentRow;
1623 }
1624 else
1625 pattern->maybesFound = TRUE;
1626 }
1627
1628 /* Process Definition lines */
1629
1630 else if (currentRow->choice == ALI_DEFLINE)
1631 {
1632 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
1633 if (defLinePtr->id != NULL)
1634 {
1635 isValidPattern = s_ProcessContigId (defLinePtr->id,
1636 pattern,
1637 fileInfoPtr);
1638 if (isValidPattern == FALSE)
1639 break;
1640 }
1641
1642 if (defLinePtr->definitions != NULL)
1643 {
1644 if (firstDefline)
1645 {
1646 firstDefline = FALSE;
1647 pattern->currentDeflineId = fileInfoPtr->sequences;
1648 }
1649 else
1650 pattern->currentDeflineId =
1651 pattern->currentDeflineId->next;
1652
1653 if (pattern->currentDeflineId == NULL)
1654 {
1655 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1656 ERR_DEFLINE_WITH_NO_ID,
1657 defLinePtr->definitions);
1658 errPtr->rowNum = defLinePtr->rowNum;
1659 isValidPattern = FALSE;
1660 break;
1661 }
1662 else
1663 {
1664 pattern->currentDeflineId->defline =
1665 defLinePtr->definitions;
1666 }
1667 }
1668 pattern->lastRow = currentRow;
1669 }
1670
1671 /* Process Other lines */
1672
1673 else if (currentRow->choice == ALI_OTHERLINE)
1674 {
1675 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
1676 if (otherLinePtr->id != NULL)
1677 {
1678 isValidPattern = s_ProcessContigId (otherLinePtr->id,
1679 pattern,
1680 fileInfoPtr);
1681 if (isValidPattern == FALSE)
1682 break;
1683 }
1684 pattern->lastRow = currentRow;
1685 }
1686
1687 currentRow = currentRow->next;
1688 }
1689
1690 /* If the last sequence is too short, mark */
1691 /* it as a maybe. */
1692
1693 if (pattern->lastRow->choice == ALI_SEQLINE)
1694 {
1695 if (s_configInfo.useMaybes == FALSE)
1696 {
1697 pattern->maybesFound = TRUE;
1698 if (pattern->currentId->length < pattern->foundInfo.seqLength)
1699 {
1700 seqLinePtr = (SeqLineInfoPtr)pattern->lastRow->data.ptrvalue;
1701 seqLinePtr->maybe = TRUE;
1702 nextToLastId = NULL;
1703 lastId = fileInfoPtr->sequences;
1704 while (lastId->next != NULL)
1705 {
1706 nextToLastId = lastId;
1707 lastId = lastId->next;
1708 }
1709 MemFree(lastId);
1710 if (nextToLastId == NULL)
1711 fileInfoPtr->sequences = NULL;
1712 else
1713 nextToLastId->next = NULL;
1714 }
1715 }
1716 else
1717 {
1718 if (pattern->currentId->length < pattern->foundInfo.seqLength)
1719 {
1720 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1721 ERR_SEQUENCE_TOO_SHORT,
1722 pattern->currentId->id,
1723 pattern->foundInfo.seqLength,
1724 pattern->currentId->length);
1725 isValidPattern = FALSE;
1726 }
1727 }
1728 }
1729
1730 /* If we found one defline, then */
1731 /* make sure they were all there */
1732
1733 if (firstDefline == FALSE)
1734 {
1735 lastId = fileInfoPtr->sequences;
1736 if (lastId != NULL)
1737 {
1738 while (lastId->next != NULL)
1739 lastId = lastId->next;
1740 if (lastId->defline == NULL)
1741 {
1742 errPtr = Ali_AddError (&(fileInfoPtr->errors),
1743 ERR_ID_WITH_NO_DEFLINE,
1744 lastId->id);
1745 isValidPattern = FALSE;
1746 }
1747 }
1748 }
1749
1750 /* If pattern not found, return failure */
1751
1752 if (!isValidPattern)
1753 {
1754 MemFree (pattern);
1755 return FALSE;
1756 }
1757
1758 /* Check for inconsistant declarations ... */
1759
1760 /* ... of file type */
1761
1762 if (s_configInfo.declaredInfo.contigOrInter == ALI_INTERLEAVED)
1763 {
1764 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_NOT_INTERLEAVED);
1765 errPtr->level = LEVEL_WARNING;
1766 }
1767
1768 /* ... of number of sequences */
1769
1770 if ((s_configInfo.declaredInfo.idCount != 0) &&
1771 (s_configInfo.declaredInfo.idCount != pattern->foundInfo.idCount))
1772 {
1773 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_ID_COUNT_MISMATCH,
1774 pattern->foundInfo.idCount,
1775 s_configInfo.declaredInfo.idCount);
1776 errPtr->level = LEVEL_WARNING;
1777 }
1778
1779 /* ... of sequence length */
1780
1781 if ((s_configInfo.declaredInfo.seqLength != 0) &&
1782 (s_configInfo.declaredInfo.seqLength != pattern->foundInfo.seqLength))
1783 {
1784 errPtr = Ali_AddError (&(fileInfoPtr->errors),ERR_SEQ_LENGTH_MISMATCH,
1785 pattern->foundInfo.seqLength,
1786 s_configInfo.declaredInfo.seqLength);
1787 errPtr->level = LEVEL_WARNING;
1788 }
1789
1790 /* If we have some possibly bad sequences that */
1791 /* weren't used, process them seperately. */
1792
1793 if (pattern->maybesFound == TRUE)
1794 fileInfoPtr->maybes = s_ProcessMaybes (rowList);
1795
1796 /* Clean up and return successfully */
1797
1798 if (pattern->currentId != NULL)
1799 {
1800 MemFree (pattern);
1801 return FALSE;
1802 }
1803 else
1804 {
1805 MemFree (pattern);
1806 return TRUE;
1807 }
1808 }
1809
1810 /*=========================================================================*/
1811 /* */
1812 /* Ali_AddError () */
1813 /* */
1814 /*=========================================================================*/
1815
1816 ErrInfoPtr Ali_AddError (ErrInfoPtr PNTR errorListPtr,
1817 Int4 iError,
1818 ...)
1819 {
1820 ErrInfoPtr newError;
1821 ErrInfoPtr lastError;
1822 va_list argPtr;
1823 CharPtr seqId;
1824 CharPtr seqStr;
1825 Int4 seqLength;
1826 Int4 prevSeqLength;
1827 CharPtr defLineStr;
1828 Int4 foundCount;
1829 Int4 declaredCount;
1830 Int4 foundLen;
1831 Int4 declaredLen;
1832 Int4 sequenceCount;
1833 Int4 errorCount;
1834 Int4 invalidChar;
1835
1836 static Int4 count = 0;
1837
1838 count++;
1839
1840 /* Create a new error record */
1841
1842 newError = (ErrInfoPtr) MemNew (sizeof(ErrInfo));
1843 newError->errNum = iError;
1844 newError->level = LEVEL_ERROR;
1845 newError->rowNum = 0;
1846 newError->extraInfo = NULL;
1847 newError->next = NULL;
1848
1849 /* Build the error message text */
1850
1851 va_start (argPtr, iError);
1852
1853 switch (iError)
1854 {
1855 case ERR_ID_WITHOUT_SEQ :
1856 seqId = va_arg (argPtr, CharPtr);
1857 newError->info = (CharPtr) MemNew (strlen (seqId) + 80);
1858 sprintf (newError->info, "Unable to match ID %s to any sequence", seqId);
1859 break;
1860 case ERR_SEQ_WITHOUT_ID :
1861 seqStr = va_arg (argPtr, CharPtr);
1862 newError->info = (CharPtr) MemNew (strlen (seqStr) + 80);
1863 sprintf (newError->info, "There is no ID for the sequence:\n%s", seqStr);
1864 break;
1865 case ERR_DUPLICATE_IDS :
1866 seqId = va_arg (argPtr, CharPtr);
1867 newError->info = (CharPtr) MemNew (strlen (seqId) + 80);
1868 sprintf (newError->info, "Duplicate ID: %s is used more than once",
1869 seqId);
1870 break;
1871 case ERR_SEQUENCE_TOO_SHORT :
1872 seqId = va_arg (argPtr, CharPtr);
1873 prevSeqLength = va_arg (argPtr, Int4);
1874 seqLength = va_arg (argPtr, Int4);
1875 newError->info = (CharPtr) MemNew (strlen (seqId) + 256);
1876 sprintf (newError->info,
1877 "Sequence %s is shorter (%d characters) than the preceding"
1878 " sequences (%d characters)", seqId, seqLength, prevSeqLength);
1879 break;
1880 case ERR_SEQUENCE_TOO_LONG :
1881 seqId = va_arg (argPtr, CharPtr);
1882 prevSeqLength = va_arg (argPtr, Int4);
1883 seqLength = va_arg (argPtr, Int4);
1884 newError->info = (CharPtr) MemNew (strlen (seqId) + 256);
1885 sprintf (newError->info,
1886 "Sequence %s is longer (%d characters) than the preceding"
1887 " sequences (%d characters)", seqId, seqLength, prevSeqLength);
1888 break;
1889 case ERR_OUT_OF_MEMORY :
1890 newError->info = (CharPtr) MemNew (80);
1891 sprintf (newError->info, "Out of memory -- memory allocation failed");
1892 break;
1893 case ERR_ID_NO_PRECEDING_SEQ :
1894 seqId = va_arg (argPtr, CharPtr);
1895 newError->info = (CharPtr) MemNew (strlen (seqId) + 100);
1896 sprintf (newError->info,
1897 "ID %s is probably invalid -- it is not immediately"
1898 " preceded by a sequence", seqId);
1899 break;
1900 case ERR_NOT_INTERLEAVED :
1901 newError->info = (CharPtr) MemNew (80);
1902 sprintf (newError->info, "File is declared to be interleaved,"
1903 " but is contiguous");
1904 break;
1905 case ERR_NOT_CONTIGUOUS :
1906 newError->info = (CharPtr) MemNew (80);
1907 sprintf (newError->info, "File is declared to be contiguous,"
1908 " but is interleaved");
1909 break;
1910 case ERR_NO_SEQUENCES_FOUND :
1911 newError->info = (CharPtr) MemNew (80);
1912 sprintf (newError->info, "No sequences were found in the file");
1913 break;
1914 case ERR_ID_COUNT_MISMATCH :
1915 foundCount = va_arg (argPtr, Int4);
1916 declaredCount = va_arg (argPtr, Int4);
1917 newError->info = (CharPtr) MemNew (128);
1918 sprintf (newError->info, "The number of sequences found (%d) doesn't"
1919 " match the number declared (%d)", foundCount, declaredCount);
1920 break;
1921 case ERR_SEQ_LENGTH_MISMATCH :
1922 foundLen = va_arg (argPtr, Int4);
1923 declaredLen = va_arg (argPtr, Int4);
1924 newError->info = (CharPtr) MemNew (128);
1925 sprintf (newError->info, "The length (%d) of the sequences found doesn't"
1926 " match the declared length (%d)", foundLen, declaredLen);
1927 break;
1928 case ERR_DEFLINE_WITH_NO_ID :
1929 defLineStr = va_arg (argPtr, CharPtr);
1930 newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100);
1931 sprintf (newError->info, "Unable to match the following definition"
1932 " line to any sequence :\n%s", defLineStr);
1933 break;
1934 case ERR_ID_WITH_NO_DEFLINE :
1935 seqId = va_arg (argPtr, CharPtr);
1936 newError->info = (CharPtr) MemNew (strlen (seqId) + 80);
1937 sprintf (newError->info, "Could not find a defline for the following"
1938 " sequence :\n%s", seqId);
1939 break;
1940 case ERR_INVALID_DEFLINE :
1941 defLineStr = va_arg (argPtr, CharPtr);
1942 invalidChar = va_arg (argPtr, Int4);
1943 newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100);
1944 sprintf (newError->info, "Invalid definitions line (illegal char '%c'):\n%s",
1945 (Char) invalidChar, defLineStr);
1946 break;
1947 case ERR_DEFLINE_NODEFS :
1948 defLineStr = va_arg (argPtr, CharPtr);
1949 newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100);
1950 sprintf (newError->info, "There is no source info enclosed by"
1951 " brackets on the definition line :\n%s", defLineStr);
1952 break;
1953 case ERR_GLOBAL_DEFLINE_NODEFS :
1954 sequenceCount = va_arg (argPtr, Int4);
1955 newError->info = (CharPtr) MemNew (128);
1956 sprintf (newError->info, "All %d of the file's definition lines are"
1957 " missing source info enclosed in [] brackets", sequenceCount);
1958 newError->level = LEVEL_MULTI;
1959 break;
1960 case ERR_MULTI_DEFLINE_NODEFS :
1961 errorCount = va_arg (argPtr, Int4);
1962 newError->info = (CharPtr) MemNew (128);
1963 sprintf (newError->info, "%d of the file's definition lines are"
1964 " missing source info enclosed in [] brackets", errorCount);
1965 newError->level = LEVEL_MULTI;
1966 break;
1967 default:
1968 newError->info = (CharPtr) MemNew (32);
1969 sprintf (newError->info, "Unknown Error");
1970 break;
1971 }
1972
1973 va_end (argPtr);
1974
1975 /* Add it to the end of the linked list */
1976
1977 if (*errorListPtr == NULL)
1978 *errorListPtr = newError;
1979 else
1980 {
1981 lastError = *errorListPtr;
1982 while (lastError->next != NULL)
1983 lastError = lastError->next;
1984 lastError->next = newError;
1985 }
1986
1987 /* Return a pointer to new record for easy access */
1988
1989 return newError;
1990 }
1991
1992 /*=========================================================================*/
1993 /* */
1994 /* s_AnalyzeContents () - */
1995 /* */
1996 /*=========================================================================*/
1997
1998 static Boolean s_AnalyzeContents (ValNodePtr rowList,
1999 AlignFileDataPtr fileInfoPtr)
2000 {
2001 Int2 idCount;
2002 Boolean result;
2003
2004 if (s_IsInterleaved (rowList, &idCount))
2005 {
2006 fileInfoPtr->info->contigOrInter = ALI_INTERLEAVED;
2007 result = s_AnalyzeInterleaved (rowList, fileInfoPtr, idCount);
2008 }
2009 else
2010 {
2011 fileInfoPtr->info->contigOrInter = ALI_CONTIGUOUS;
2012 result = s_AnalyzeContiguous (rowList, fileInfoPtr);
2013 }
2014
2015 return result;
2016 }
2017
2018 /*=========================================================================*/
2019 /* */
2020 /* SeqLineReEval () - Re-evaluate a line after forcing the first 'word' */
2021 /* to be an ID. */
2022 /* */
2023 /*=========================================================================*/
2024
2025 SeqLineInfoPtr SeqLineReEval (SeqLineInfoPtr seqLinePtr)
2026 {
2027 CharPtr seqStr;
2028 CharPtr idStr;
2029 CharPtr oldStr;
2030 SeqLineInfoPtr newSeqLinePtr;
2031
2032 /* If the line is already split up, */
2033 /* then this won't work. */
2034
2035 if ((seqLinePtr->sequence != NULL) && (seqLinePtr->id != NULL))
2036 return NULL;
2037
2038 /* Determine the string that we're splitting up */
2039
2040 if (seqLinePtr->sequence != NULL)
2041 oldStr = seqLinePtr->sequence;
2042 else if (seqLinePtr->id != NULL)
2043 oldStr = seqLinePtr->id;
2044 else
2045 return NULL;
2046
2047 /* If there's only one 'word' then */
2048 /* we can't split it. */
2049
2050 if (StringLen (oldStr) == seqLinePtr->firstWordLen)
2051 return NULL;
2052
2053 /* Allocate mem for the new strings */
2054
2055 seqStr = (CharPtr) MemNew (StringLen (oldStr) -
2056 seqLinePtr->firstWordLen + 1);
2057 if (seqStr == NULL)
2058 return NULL;
2059 idStr = (CharPtr) MemNew (seqLinePtr->firstWordLen + 1);
2060 if (idStr == NULL)
2061 {
2062 MemFree (seqStr);
2063 return NULL;
2064 }
2065
2066 /* Break up the existing string */
2067
2068 StringNCpy(idStr, oldStr, seqLinePtr->firstWordLen);
2069 idStr[seqLinePtr->firstWordLen] = '\0';
2070 StringCpy(seqStr, oldStr + seqLinePtr->firstWordLen);
2071
2072 /* Return successfully */
2073
2074 newSeqLinePtr = (SeqLineInfoPtr) MemNew (sizeof (SeqLineInfo));
2075 newSeqLinePtr->sequence = seqStr;
2076 newSeqLinePtr->id = idStr;
2077 newSeqLinePtr->rowNum = seqLinePtr->rowNum;
2078 newSeqLinePtr->type = Ali_SeqLineGetType(seqStr, &s_configInfo);
2079 newSeqLinePtr->maybe = seqLinePtr->maybe;
2080 newSeqLinePtr->firstWordLen = seqLinePtr->firstWordLen;
2081
2082 return newSeqLinePtr;
2083 }
2084
2085 /*=========================================================================*/
2086 /* */
2087 /* s_IsExistingId () -- Determine if the given ID is one that has already */
2088 /* been added to the linked list of IDs. */
2089 /* */
2090 /*=========================================================================*/
2091
2092 static Boolean s_IsExistingId (AlignFileDataPtr fileInfoPtr,
2093 CharPtr testIdStr)
2094 {
2095 IdInfoPtr idListPtr = NULL;
2096
2097 /* See if this ID already exists */
2098
2099 idListPtr = fileInfoPtr->sequences;
2100 while (idListPtr != NULL)
2101 {
2102 if (StringCmp(idListPtr->id,testIdStr) == 0)
2103 return TRUE;
2104 idListPtr = idListPtr->next;
2105 }
2106
2107 /* If we made it to here, then */
2108 /* the ID wasn't found. */
2109
2110 return FALSE;
2111 }
2112
2113 /*=========================================================================*/
2114 /* */
2115 /* s_CheckContext () */
2116 /* */
2117 /*=========================================================================*/
2118
2119 static Boolean s_CheckContext (ValNodePtr rowList,
2120 AlignFileDataPtr fileInfoPtr)
2121 {
2122 ValNodePtr currentRow;
2123 ValNodePtr lastRow;
2124 CharPtr idStr;
2125 SeqLineInfoPtr seqLinePtr;
2126 SeqLineInfoPtr reEvalSeqPtr;
2127 SeqLineInfoPtr prevSeqLinePtr = NULL;
2128 DefLineInfoPtr defLinePtr;
2129 OtherLineInfoPtr otherLinePtr;
2130 Int2 patternSeqType;
2131 ErrInfoPtr errPtr;
2132 Boolean changesMade;
2133 Int4 currLen;
2134 Int4 prevLen;
2135
2136 do /* Until no changes are made */
2137 {
2138 currentRow = rowList;
2139 lastRow = NULL;
2140 patternSeqType = ALI_AMBIGUOUS;
2141
2142 changesMade = FALSE;
2143 while (currentRow != NULL)
2144 {
2145 if (currentRow->choice == ALI_SEQLINE)
2146 {
2147 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
2148
2149 if ((seqLinePtr->maybe == FALSE) ||
2150 (seqLinePtr->maybe == TRUE) &&
2151 (s_configInfo.useMaybes == TRUE))
2152 {
2153 /* If there is an ID, make sure that it */
2154 /* immediately precedes a sequence line. */
2155
2156 if (seqLinePtr->id != NULL)
2157 {
2158 if ((seqLinePtr->sequence == NULL) &&
2159 (s_GetRowSeqString(currentRow->next) == NULL))
2160 {
2161 Ali_ChangeRowToOther (currentRow);
2162 changesMade = TRUE;
2163 continue;
2164 }
2165 }
2166
2167 /* Check for an ID that was accidentally lumped */
2168 /* in with a sequence due to being composed */
2169 /* entirely of sequence characters. */
2170
2171 if (prevSeqLinePtr != NULL)
2172 {
2173 currLen = StringLen (seqLinePtr->sequence);
2174 prevLen = StringLen (prevSeqLinePtr->sequence);
2175
2176 if ((currLen > prevLen) &&
2177 (seqLinePtr->id == NULL) &&
2178 (prevSeqLinePtr->id != NULL))
2179 {
2180 reEvalSeqPtr = SeqLineReEval (seqLinePtr);
2181
2182 if (reEvalSeqPtr != NULL)
2183 {
2184 currLen = StringLen (reEvalSeqPtr->sequence);
2185
2186 /* If the new seqline fits better, use it */
2187
2188 if (currLen == prevLen)
2189 {
2190 MemFree(seqLinePtr->sequence);
2191 MemFree(seqLinePtr->id);
2192 MemFree(seqLinePtr);
2193 currentRow->data.ptrvalue = reEvalSeqPtr;
2194 continue;
2195 }
2196 else
2197 {
2198 MemFree(reEvalSeqPtr->sequence);
2199 MemFree(reEvalSeqPtr->id);
2200 MemFree(reEvalSeqPtr);
2201 }
2202 }
2203 }
2204 }
2205
2206 /* If there's an established pattern of sequence */
2207 /* type, then match the current line against it. */
2208 /* Otherwise, set the pattern. */
2209
2210 if (seqLinePtr->type != ALI_AMBIGUOUS)
2211 {
2212 if (patternSeqType != ALI_AMBIGUOUS)
2213 {
2214 if (patternSeqType != seqLinePtr->type)
2215 {
2216 reEvalSeqPtr = SeqLineReEval (seqLinePtr);
2217 if ((reEvalSeqPtr == NULL) ||
2218 ((reEvalSeqPtr != NULL) &&
2219 (patternSeqType != reEvalSeqPtr->type)))
2220 {
2221 if (reEvalSeqPtr != NULL)
2222 {
2223 MemFree(reEvalSeqPtr->sequence);
2224 MemFree(reEvalSeqPtr->id);
2225 MemFree(reEvalSeqPtr);
2226 }
2227 Ali_ChangeRowToOther (currentRow);
2228 changesMade = TRUE;
2229 continue;
2230 }
2231 else
2232 {
2233 MemFree(seqLinePtr->sequence);
2234 MemFree(seqLinePtr->id);
2235 MemFree(seqLinePtr);
2236 currentRow->data.ptrvalue = reEvalSeqPtr;
2237 continue;
2238 }
2239 }
2240 }
2241 else
2242 patternSeqType = seqLinePtr->type;
2243 }
2244
2245 /* */
2246
2247 prevSeqLinePtr = seqLinePtr;
2248
2249 }
2250 lastRow = currentRow;
2251 }
2252
2253 else if (currentRow->choice == ALI_DEFLINE)
2254 {
2255 /* If there is an ID, make sure that it */
2256 /* immediately precedes a sequence line. */
2257
2258 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
2259 if ((defLinePtr->id != NULL) &&
2260 (s_IsExistingId(fileInfoPtr, defLinePtr->id) == FALSE) &&
2261 (s_GetRowSeqString(currentRow->next) == NULL))
2262 {
2263 Ali_ChangeRowToOther (currentRow);
2264 changesMade = TRUE;
2265 continue;
2266 }
2267 lastRow = currentRow;
2268 }
2269
2270 else if (currentRow->choice == ALI_OTHERLINE)
2271 {
2272 /* If there is an ID, make sure that it */
2273 /* immediately precedes a sequence line. */
2274
2275 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
2276 if (otherLinePtr->id != NULL)
2277 {
2278 if (s_GetRowSeqString(currentRow->next) == NULL)
2279 {
2280 otherLinePtr->other = otherLinePtr->id;
2281 otherLinePtr->id = NULL;
2282 changesMade = TRUE;
2283 }
2284 }
2285 lastRow = currentRow;
2286 }
2287
2288 currentRow = currentRow->next;
2289 }
2290 } while (changesMade == TRUE);
2291
2292 /* Check for a dangling ID */
2293
2294 if ((lastRow != NULL) &&
2295 ((idStr = s_GetRowIdString (lastRow)) != NULL) &&
2296 (s_GetRowSeqString (lastRow) == NULL))
2297 {
2298 errPtr = Ali_AddError (&(fileInfoPtr->errors),
2299 ERR_ID_WITHOUT_SEQ,
2300 idStr);
2301 return FALSE;
2302 }
2303
2304 /* Return successfully */
2305
2306 return TRUE;
2307 }
2308
2309 /*=========================================================================*/
2310 /* */
2311 /* s_SortErrors () -- Sort errors by level, so that the most severe appear */
2312 /* first. */
2313 /* */
2314 /* NOTE : Does a lame bubblesort, which nevertheless should be fast */
2315 /* enough for the relatively small linked lists we're dealing */
2316 /* with here. */
2317 /* */
2318 /*=========================================================================*/
2319
2320 static void s_SortErrors (AlignFileDataPtr fileInfoPtr)
2321 {
2322 Boolean swapMade = TRUE;
2323 ErrInfoPtr prevPtr = NULL;
2324 ErrInfoPtr nextPtr = NULL;
2325 ErrInfoPtr errPtr = NULL;
2326
2327 while (swapMade == TRUE)
2328 {
2329 swapMade = FALSE;
2330 errPtr = fileInfoPtr->errors;
2331 while (errPtr->next != NULL)
2332 {
2333 nextPtr = errPtr->next;
2334 if (errPtr->level > nextPtr->level)
2335 {
2336 swapMade = TRUE;
2337
2338 /* Remove the error from the list */
2339
2340 if (errPtr == fileInfoPtr->errors)
2341 fileInfoPtr->errors = nextPtr;
2342 else
2343 prevPtr->next = nextPtr;
2344
2345 /* Then re-insert it after the following error */
2346
2347 errPtr->next = nextPtr->next;
2348 nextPtr->next = errPtr;
2349
2350 /* The old next error is now the previous error */
2351
2352 prevPtr = nextPtr;
2353 }
2354 else
2355 {
2356 prevPtr = errPtr;
2357 errPtr = errPtr->next;
2358 }
2359 }
2360 }
2361
2362 return;
2363 }
2364
2365 /*=========================================================================*/
2366 /* */
2367 /* s_ReplaceUWithT () -- Replace all the Us in a nucleotide sequence with */
2368 /* Ts. */
2369 /* */
2370 /*=========================================================================*/
2371
2372 static void s_ReplaceUWithT (AlignFileDataPtr fileInfoPtr)
2373 {
2374 IdInfoPtr seqPtr = NULL;
2375 SeqPartPtr seqPart = NULL;
2376 CharPtr seqString;
2377 Int4 i;
2378
2379 seqPtr = fileInfoPtr->sequences;
2380 while (seqPtr != NULL)
2381 {
2382 seqPart = seqPtr->sequence;
2383 while (seqPart != NULL)
2384 {
2385 seqString = seqPart->sequence;
2386 for (i = 0; seqString[i] != '\0'; i++)
2387 if (seqString[i] == 'U')
2388 seqString[i] = 'T';
2389 else if (seqString[i] == 'u')
2390 seqString[i] = 't';
2391 seqPart = seqPart->next;
2392 }
2393 seqPtr = seqPtr->next;
2394 }
2395
2396 }
2397
2398 /*=========================================================================*/
2399 /* */
2400 /* s_AnalyzeErrors () -- Look for patterns in the errors that can be used */
2401 /* to create more general, higher-level errors */
2402 /* instead. */
2403 /* */
2404 /*=========================================================================*/
2405
2406 static void s_AnalyzeErrors (AlignFileDataPtr fileInfoPtr)
2407 {
2408 Int4 seqCount = 0;
2409 Int4 defCount = 0;
2410 Int4 errCount = 0;
2411 IdInfoPtr seqPtr = NULL;
2412 ErrInfoPtr errPtr = NULL;
2413 ErrInfoPtr prevErrPtr = NULL;
2414 ErrInfoPtr nextErrPtr = NULL;
2415
2416 if (fileInfoPtr->errors == NULL)
2417 return;
2418
2419 /* Get counts of sequences and deflines */
2420
2421 seqPtr = fileInfoPtr->sequences;
2422 while (seqPtr != NULL)
2423 {
2424 seqCount++;
2425 if (seqPtr->defline != NULL)
2426 defCount++;
2427 seqPtr = seqPtr->next;
2428 }
2429
2430 /* Check for "missing bracket" defline errors */
2431
2432 errPtr = fileInfoPtr->errors;
2433 while (errPtr != NULL)
2434 {
2435 if (errPtr->errNum == ERR_DEFLINE_NODEFS)
2436 errCount++;
2437 errPtr = errPtr->next;
2438 }
2439
2440 /* If ALL deflines have missing bracket errors */
2441 /* then replace the msgs with one global msg */
2442
2443 if (errCount == seqCount)
2444 {
2445 if (s_configInfo.errExpandLevel != ALI_ERRMSG_EXPAND_ALL)
2446 {
2447 errPtr = fileInfoPtr->errors;
2448 while (errPtr != NULL)
2449 {
2450 nextErrPtr = errPtr->next;
2451 if (errPtr->errNum == ERR_DEFLINE_NODEFS)
2452 {
2453 if (errPtr == fileInfoPtr->errors)
2454 {
2455 fileInfoPtr->errors = fileInfoPtr->errors->next;
2456 s_FreeErrorNode (errPtr);
2457 errPtr = NULL;
2458 }
2459 else
2460 {
2461 prevErrPtr->next = nextErrPtr;
2462 s_FreeErrorNode (errPtr);
2463 errPtr = NULL;
2464 }
2465 }
2466 else
2467 prevErrPtr = errPtr;
2468 errPtr = nextErrPtr;
2469 }
2470 }
2471 Ali_AddError (&(fileInfoPtr->errors), ERR_GLOBAL_DEFLINE_NODEFS,
2472 seqCount);
2473 }
2474
2475 /* If SOME deflines have missing bracket errors */
2476 /* then replace the msgs with one global msg */
2477
2478 else if (errCount > 1)
2479 {
2480 if (s_configInfo.errExpandLevel == ALI_ERRMSG_EXPAND_NONE)
2481 {
2482 errPtr = fileInfoPtr->errors;
2483 while (errPtr != NULL)
2484 {
2485 nextErrPtr = errPtr->next;
2486 if (errPtr->errNum == ERR_DEFLINE_NODEFS)
2487 {
2488 if (errPtr == fileInfoPtr->errors)
2489 {
2490 fileInfoPtr->errors = fileInfoPtr->errors->next;
2491 s_FreeErrorNode (errPtr);
2492 errPtr = NULL;
2493 }
2494 else
2495 {
2496 prevErrPtr->next = nextErrPtr;
2497 s_FreeErrorNode (errPtr);
2498 errPtr = NULL;
2499 }
2500 }
2501 else
2502 prevErrPtr = errPtr;
2503 errPtr = nextErrPtr;
2504 }
2505 }
2506 Ali_AddError (&(fileInfoPtr->errors), ERR_MULTI_DEFLINE_NODEFS,
2507 errCount);
2508 }
2509
2510 /* Finally, sort the errors by type */
2511
2512 s_SortErrors (fileInfoPtr);
2513
2514 return;
2515
2516 }
2517
2518 /*=========================================================================*/
2519 /* */
2520 /* Ali_Read () */
2521 /* */
2522 /*=========================================================================*/
2523
2524 AlignFileDataPtr Ali_Read (FILE PNTR alignFilePtr)
2525 {
2526 ValNodePtr rowList = NULL;
2527 AlignFileDataPtr fileInfoPtr;
2528 ErrInfoPtr errorList = NULL;
2529
2530 /* Check parameters */
2531
2532 if (alignFilePtr == NULL)
2533 return FALSE;
2534
2535 /* Initialize */
2536
2537 fileInfoPtr = (AlignFileDataPtr) MemNew (sizeof(AlignFileData));
2538 fileInfoPtr->sequences = NULL;
2539 fileInfoPtr->maybes = NULL;
2540 fileInfoPtr->errors = NULL;
2541 fileInfoPtr->info = (ParsedInfoPtr) MemNew (sizeof (ParsedInfo));
2542 if (fileInfoPtr->info == NULL)
2543 {
2544 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
2545 Ali_Free (fileInfoPtr);
2546 return NULL;
2547 }
2548 fileInfoPtr->info->missingChar = NULL;
2549 fileInfoPtr->info->gapChar = NULL;
2550 fileInfoPtr->info->unalignedChar = NULL;
2551
2552 if (s_configurationSet == FALSE)
2553 Ali_SetConfig (NULL, ALI_SET_DEFAULTS);
2554
2555 /* Read in and parse each row */
2556
2557 rowList = Ali_ReadLines (alignFilePtr, &errorList, &s_configInfo, fileInfoPtr);
2558 fileInfoPtr->errors = errorList;
2559
2560 if (rowList == NULL)
2561 return fileInfoPtr;
2562
2563 /* Make first pass to adjust the rows based on context */
2564
2565 if (s_CheckContext(rowList, fileInfoPtr) != TRUE)
2566 return fileInfoPtr;
2567
2568 s_DisplayRowList (rowList, s_configInfo.debugLevel);
2569
2570 /* Analyze the IDs and sequences for consistancy */
2571
2572 s_AnalyzeContents (rowList, fileInfoPtr);
2573
2574 if (fileInfoPtr->sequences == NULL)
2575 Ali_AddError (&(fileInfoPtr->errors), ERR_NO_SEQUENCES_FOUND);
2576
2577 /* Analyze the errors to see if they can be */
2578 /* combined into more general global errors */
2579
2580 if (fileInfoPtr->errors != NULL)
2581 s_AnalyzeErrors (fileInfoPtr);
2582
2583 /* Return the missing, gap, and unaligned chars used */
2584
2585 fileInfoPtr->info->missingChar = (CharPtr) MemNew(16);
2586 StringCpy (fileInfoPtr->info->missingChar, s_configInfo.missingChar);
2587
2588 fileInfoPtr->info->gapChar = (CharPtr) MemNew(16);
2589 StringCpy (fileInfoPtr->info->gapChar, s_configInfo.gapChar);
2590
2591 fileInfoPtr->info->unalignedChar = (CharPtr) MemNew(16);
2592 StringCpy (fileInfoPtr->info->unalignedChar, s_configInfo.unalignedChar);
2593
2594 /* If these are nucleotide sequences, then */
2595 /* replace all 'U's with 'T's. */
2596
2597 s_ReplaceUWithT (fileInfoPtr);
2598
2599 /* Clean up and return successfully */
2600
2601 s_FreeRowList_Safe (rowList);
2602 return fileInfoPtr;
2603 }
2604
2605 /***************************************************************************
2606 *
2607 * section to convert AlignFileDataPtr content into seqalign/seqentry
2608 * structures
2609 *
2610 ***************************************************************************/
2611 typedef struct tinyinfo {
2612 Int4 n;
2613 struct tinyinfo PNTR next;
2614 } ALI_TinyInfo, PNTR ALI_TinyInfoPtr;
2615
2616
2617 static Boolean is_gap_char(Char c, CharPtr gapChar)
2618 {
2619 if (StrChr(gapChar, c) != NULL)
2620 return TRUE;
2621 return FALSE;
2622 }
2623
2624 static int LIBCALLBACK ALI_SortTips(VoidPtr ptr1, VoidPtr ptr2)
2625 {
2626 ALI_TinyInfoPtr tip1;
2627 ALI_TinyInfoPtr tip2;
2628
2629 tip1 = *((ALI_TinyInfoPtr PNTR)ptr1);
2630 tip2 = *((ALI_TinyInfoPtr PNTR)ptr2);
2631 if (tip1->n > tip2->n)
2632 return 1;
2633 if (tip1->n < tip2->n)
2634 return -1;
2635 return 0;
2636 }
2637
2638 static Boolean is_valid_seq(Char c, CharPtr missingChar, CharPtr gapChar)
2639 {
2640 if (StrChr("\0", c))
2641 return FALSE;
2642 if (StrChr(missingChar, c) != NULL)
2643 return TRUE;
2644 if (StrChr(gapChar, c) != NULL)
2645 return TRUE;
2646 if (IS_ALPHA(c))
2647 return TRUE;
2648 if (c == '-')
2649 return TRUE;
2650 if (c == '?')
2651 return TRUE;
2652 return FALSE;
2653 }
2654
2655 static Boolean is_missing(Char c, CharPtr missingChar)
2656 {
2657 if (StrChr(missingChar, c) != NULL)
2658 return TRUE;
2659 else
2660 return FALSE;
2661 }
2662
2663 static SeqAlignPtr ALI_MakeSeqAlign(AlignFileDataPtr afp, CharPtr PNTR PNTR stringsptr, Int4Ptr numseq, CharPtr PNTR PNTR deflineptr)
2664 {
2665 Int4 alnlen;
2666 CharPtr buf;
2667 CharPtr c;
2668 Int4 ctr;
2669 Int4 ctr_prev;
2670 CharPtr PNTR deflines;
2671 DenseSegPtr dsp;
2672 Int4 i;
2673 IdInfoPtr id_head;
2674 IdInfoPtr iip;
2675 Boolean ingap;
2676 Boolean isgap;
2677 Int4 j;
2678 Int4 last;
2679 Int4 len;
2680 Int4 maxlen;
2681 Int4 numtips;
2682 SeqAlignPtr sap;
2683 SeqPartPtr seq;
2684 SeqIdPtr sip;
2685 SeqIdPtr sip_prev;
2686 CharPtr PNTR strings;
2687 Char text[100];
2688 ALI_TinyInfoPtr tip;
2689 ALI_TinyInfoPtr tip_head;
2690 ALI_TinyInfoPtr tip_prev;
2691 ALI_TinyInfoPtr PNTR tiparray;
2692
2693 if (afp->info == NULL)
2694 {
2695 ErrPostEx(SEV_ERROR, 0, 0, "NULL afp->info -- alignment not read correctly\n");
2696 return NULL;
2697 }
2698 i = 0;
2699 id_head = afp->sequences;
2700 iip = id_head;
2701 while (iip != NULL)
2702 {
2703 i++;
2704 if (iip->id == NULL)
2705 {
2706 sprintf(text, "No id read for sequence %d\n", i);
2707 ErrPostEx(SEV_ERROR, 0, 0, text);
2708 return NULL;
2709 }
2710 iip = iip->next;
2711 }
2712 sap = SeqAlignNew();
2713 sap->type = SAT_PARTIAL;
2714 sap->segtype = SAS_DENSEG;
2715 sap->dim = i;
2716 dsp = DenseSegNew();
2717 dsp->dim = i;
2718 strings = (CharPtr PNTR)MemNew(i*sizeof(CharPtr));
2719 deflines = (CharPtr PNTR)MemNew(i*sizeof(CharPtr));
2720 tip_head = tip_prev = NULL;
2721 iip = id_head;
2722 maxlen = 0;
2723 tip_head = tip_prev = NULL;
2724 numtips = 0;
2725 alnlen = 0;
2726 i = 1;
2727 while (iip != NULL)
2728 {
2729 len = 0;
2730 ctr = 0;
2731 seq = iip->sequence;
2732 if (seq == NULL || seq->sequence == NULL)
2733 {
2734 sprintf(text, "Error in reading sequence %d -- no sequence characters read\n", i);
2735 ErrPostEx(SEV_ERROR, 0, 0, text);
2736 return NULL;
2737 }
2738 c = seq->sequence;
2739 if (is_gap_char(*c, afp->info->gapChar))
2740 ingap = TRUE;
2741 else
2742 ingap = FALSE;
2743 while (seq != NULL)
2744 {
2745 c = seq->sequence;
2746 if (c == NULL)
2747 {
2748 sprintf(text, "Error in reading sequence %d -- no sequence characters read\n", i);
2749 ErrPostEx(SEV_ERROR, 0, 0, text);
2750 return NULL;
2751 }
2752 while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar))
2753 {
2754 if (is_gap_char(*c, afp->info->gapChar) && !ingap)
2755 {
2756 tip = (ALI_TinyInfoPtr)MemNew(sizeof(ALI_TinyInfo));
2757 tip->n = ctr;
2758 if (tip_head != NULL)
2759 {
2760 tip_prev->next = tip;
2761 tip_prev = tip;
2762 } else
2763 tip_head = tip_prev = tip;
2764 ingap = TRUE;
2765 numtips++;
2766 } else if (!is_gap_char(*c, afp->info->gapChar) && ingap)
2767 {
2768 tip = (ALI_TinyInfoPtr)MemNew(sizeof(ALI_TinyInfo));
2769 tip->n = ctr;
2770 if (tip_head != NULL)
2771 {
2772 tip_prev->next = tip;
2773 tip_prev = tip;
2774 } else
2775 tip_head = tip_prev = tip;
2776 ingap = FALSE;
2777 numtips++;
2778 }
2779 if (!is_gap_char(*c, afp->info->gapChar))
2780 len++;
2781 ctr++;
2782 c++;
2783 }
2784 seq = seq->next;
2785 }
2786 if (ctr > alnlen)
2787 alnlen = ctr;
2788 if (len > maxlen)
2789 maxlen = len;
2790 iip = iip->next;
2791 i++;
2792 }
2793 if (tip_head == NULL) /* this is a gapless alignment */
2794 {
2795 dsp->numseg = 1;
2796 dsp->starts = (Int4Ptr)MemNew((dsp->dim)*sizeof(Int4));
2797 dsp->lens = (Int4Ptr)MemNew(sizeof(Int4));
2798 dsp->strands = (Uint1Ptr)MemNew((dsp->dim)*sizeof(Uint1));
2799 for (i=0; i<dsp->dim; i++)
2800 {
2801 dsp->strands[i] = Seq_strand_plus;
2802 }
2803 dsp->lens[0] = id_head->length;
2804 /* all the starts are 0 anyway, just leave them and get the ids & seqs */
2805 iip = id_head;
2806 sip_prev = NULL;
2807 buf = (CharPtr)MemNew((maxlen+1)*sizeof(Char));
2808 i = 0;
2809 while (iip != NULL)
2810 {
2811 sip = MakeSeqID(iip->id);
2812 deflines[i] = StringSave(iip->defline);
2813 if (sip_prev != NULL)
2814 {
2815 sip_prev->next = sip;
2816 sip_prev = sip;
2817 } else
2818 dsp->ids = sip_prev = sip;
2819 seq = iip->sequence;
2820 for (ctr = 0; ctr<(maxlen+1); ctr++)
2821 {
2822 buf[ctr] = '\0';
2823 }
2824 ctr = 0;
2825 while (seq != NULL)
2826 {
2827 c = seq->sequence;
2828 while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar))
2829 {
2830 if (is_missing(*c, afp->info->missingChar))
2831 buf[ctr] = 'N';
2832 else
2833 buf[ctr] = *c;
2834 ctr++;
2835 c++;
2836 }
2837 seq = seq->next;
2838 }
2839 strings[i] = StringSave(buf);
2840 iip = iip->next;
2841 i++;
2842 }
2843 sap->segs = (Pointer)dsp;
2844 MemFree(buf);
2845 *numseq = dsp->dim;
2846 *stringsptr = strings;
2847 *deflineptr = deflines;
2848 return sap;
2849 }
2850 /* now all the segment boundaries have been collected, so sort them */
2851 tiparray = (ALI_TinyInfoPtr PNTR)MemNew(numtips*sizeof(ALI_TinyInfoPtr));
2852 i = 0;
2853 tip = tip_head;
2854 while (tip != NULL)
2855 {
2856 tiparray[i] = tip;
2857 i++;
2858 tip = tip->next;
2859 }
2860 HeapSort(tiparray, numtips, sizeof(ALI_TinyInfoPtr), ALI_SortTips);
2861 dsp->numseg = 2; /* one for the first, one for the last */
2862 for (i=1; i<numtips; i++)
2863 {
2864 if (tiparray[i]->n != tiparray[i-1]->n)
2865 dsp->numseg++;
2866 }
2867 dsp->starts = (Int4Ptr)MemNew((dsp->dim)*(dsp->numseg)*sizeof(Int4));
2868 dsp->lens = (Int4Ptr)MemNew((dsp->numseg)*sizeof(Int4));
2869 last = 0;
2870 j=0;
2871 dsp->lens[0] = tiparray[0]->n;
2872 last = tiparray[0]->n;
2873 j++;
2874 for (i=1; i<numtips; i++)
2875 {
2876 if (tiparray[i]->n != tiparray[i-1]->n)
2877 {
2878 dsp->lens[j] = tiparray[i]->n-last;
2879 last = tiparray[i]->n;
2880 j++;
2881 }
2882 }
2883 dsp->lens[j] = alnlen - last;
2884 dsp->strands = (Uint1Ptr)MemNew((dsp->dim)*(dsp->numseg)*sizeof(Uint1));
2885 /* do we have any strand info to the contrary? */
2886 for (i=0; i<(dsp->dim)*(dsp->numseg); i++)
2887 {
2888 dsp->strands[i] = Seq_strand_plus;
2889 }
2890 iip = id_head;
2891 i = 0;
2892 buf = (CharPtr)MemNew((maxlen+1)*sizeof(Char));
2893 sip_prev = NULL;
2894 while (iip != NULL)
2895 {
2896 j = 0;
2897 for (ctr = 0; ctr<(maxlen+1); ctr++)
2898 {
2899 buf[ctr] = '\0';
2900 }
2901 sip = MakeSeqID(iip->id);
2902 SeqIdSetFree(sip->next);
2903 sip->next = NULL;
2904 deflines[i] = StringSave(iip->defline);
2905 if (sip_prev != NULL)
2906 {
2907 sip_prev->next = sip;
2908 sip_prev = sip;
2909 } else
2910 dsp->ids = sip_prev = sip;
2911 ctr = 0;
2912 ctr_prev = 0;
2913 len = 0;
2914 seq = iip->sequence;
2915 while (seq != NULL)
2916 {
2917 c = seq->sequence;
2918 while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar))
2919 {
2920 isgap = is_gap_char(*c, afp->info->gapChar);
2921 if (!isgap)
2922 {
2923 if (is_missing(*c, afp->info->missingChar))
2924 buf[ctr] = 'N';
2925 else
2926 buf[ctr] = *c;
2927 ctr++;
2928 }
2929 len++;
2930 if (len == dsp->lens[j])
2931 {
2932 if (isgap)
2933 dsp->starts[dsp->dim*j+i] = -1;
2934 else
2935 {
2936 dsp->starts[dsp->dim*j+i] = ctr_prev;
2937 ctr_prev = ctr;
2938 }
2939 j++;
2940 len = 0;
2941 }
2942 if (*(c+1) == '\0' && seq->next == NULL && j < dsp->numseg)
2943 {
2944 if (isgap)
2945 dsp->starts[dsp->dim*j+i] = -1;
2946 else
2947 dsp->starts[dsp->dim*j+i] = ctr_prev;
2948 }
2949 c++;
2950 }
2951 seq = seq->next;
2952 }
2953 strings[i] = StringSave(buf);
2954 iip = iip->next;
2955 i++;
2956 }
2957 sap->segs = (Pointer)dsp;
2958 MemFree(buf);
2959 for (i=0; i<numtips; i++)
2960 {
2961 MemFree(tiparray[i]);
2962 }
2963 MemFree(tiparray);
2964 *numseq = dsp->dim;
2965 *stringsptr = strings;
2966 *deflineptr = deflines;
2967 return sap;
2968 }
2969
2970 static SeqEntryPtr ALI_make_seqentry_for_seqentry (SeqEntryPtr sep)
2971 {
2972 BioseqPtr bsp;
2973 BioseqSetPtr bssp;
2974 SeqEntryPtr sep_new;
2975 SeqEntryPtr sep_tmp;
2976
2977 if (IS_Bioseq(sep) || IS_Bioseq_set(sep))
2978 {
2979 if (sep->next)
2980 {
2981 bssp = BioseqSetNew ();
2982 bssp->_class = 14;
2983 bssp->seq_set = sep;
2984 sep_new = SeqEntryNew ();
2985 sep_new->choice = 2;
2986 sep_new->data.ptrvalue = bssp;
2987 SeqMgrLinkSeqEntry (sep_new, 0, NULL);
2988 sep_tmp = bssp->seq_set;
2989 while (sep_tmp != NULL)
2990 {
2991 if (IS_Bioseq(sep_tmp))
2992 {
2993 bsp = (BioseqPtr)sep_tmp->data.ptrvalue;
2994 ObjMgrConnect (OBJ_BIOSEQ, (Pointer) bsp, OBJ_BIOSEQSET, (Pointer) bssp);
2995 }
2996 sep_tmp = sep_tmp->next;
2997 }
2998 } else
2999 return sep;
3000 }
3001 return sep_new;
3002 }
3003
3004 static Uint1 ALI_GuessMoltype(CharPtr string)
3005 {
3006 CharPtr c;
3007
3008 c = string;
3009 while (*c != '\0')
3010 {
3011 if (StringChr("EFIJLOPQUXZefijlopquxz", *c) != NULL) /* protein */
3012 return Seq_mol_aa;
3013 c++;
3014 }
3015 return Seq_mol_na;
3016 }
3017 static Int4 SPI_MapRowCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row, Uint1 direction)
3018 {
3019 Int4 pos;
3020
3021 if (direction == 1)
3022 {
3023 pos = AlnMgrMapRowCoords(sap, from, row, NULL);
3024 from++;
3025 while (pos < 0 && from <= to)
3026 {
3027 pos = AlnMgrMapRowCoords(sap, from, row, NULL);
3028 from++;
3029 }
3030 } else
3031 {
3032 pos = AlnMgrMapRowCoords(sap, to, row, NULL);
3033 to--;
3034 while (pos < 0 && to >= from)
3035 {
3036 pos = AlnMgrMapRowCoords(sap, to, row, NULL);
3037 to--;
3038 }
3039 }
3040 if (pos < 0)
3041 return -1;
3042 return pos;
3043 }
3044
3045 static CharPtr SPI_WriteAlnLine(Int4 row, Int4 from, Int4 to, SeqAlignPtr sap)
3046 {
3047 AlnMsgPtr amp;
3048 BioseqPtr bsp;
3049 Uint1 buf[65+2];
3050 Int4 ctr;
3051 Int4 i;
3052 Boolean more;
3053 Int4 n;
3054 SeqIdPtr sip;
3055 SeqPortPtr spp;
3056 CharPtr string;
3057
3058 n = AlnMgrGetNumRows(sap);
3059 if (row > n || row < 1)
3060 return NULL;
3061 string = (CharPtr)MemNew((65+2)*sizeof(Char));
3062 for (n=0; n<(65+2); n++)
3063 {
3064 string[n] = '\0';
3065 }
3066 sip = AlnMgrGetNthSeqIdPtr(sap, row);
3067 bsp = BioseqLockById(sip);
3068 amp = AlnMsgNew();
3069 amp->row_num = row;
3070 amp->from_m = from;
3071 amp->to_m = to;
3072 if (amp->to_m < 0)
3073 amp->to_m = -1;
3074 n = 0;
3075 while ((more = AlnMgrGetNextAlnBit(sap, amp)) == TRUE)
3076 {
3077 if (amp->to_b - amp->from_b > amp->to_m - amp->from_m) /* kludge */
3078 {
3079 if (amp->strand == Seq_strand_minus)
3080 amp->from_b = amp->to_b - (amp->to_m - amp->from_m);
3081 else
3082 amp->to_b = amp->from_b + (amp->to_m - amp->from_m);
3083 }
3084 if (amp->gap == 0)
3085 {
3086 spp = SeqPortNew(bsp, amp->from_b, amp->to_b, amp->strand, Seq_code_iupacna);
3087 ctr = SeqPortRead(spp, buf, (amp->to_b - amp->from_b + 1));
3088 SeqPortFree(spp);
3089 for (i=n; i<n+ctr; i++)
3090 {
3091 string[i] = buf[i-n];
3092 }
3093 n += ctr;
3094 } else
3095 {
3096 for (i=n; i<(n+amp->to_b-amp->from_b+1); i++)
3097 {
3098 string[i] = '-';
3099 }
3100 n += amp->to_b-amp->from_b+1;
3101 }
3102 }
3103 AlnMsgFree(amp);
3104 SeqIdFree(sip);
3105 return string;
3106 }
3107 static Int4 spi_get_num_places(Int4 num)
3108 {
3109 FloatHi f;
3110 Int4 i;
3111 Int4 x;
3112
3113 x = 10;
3114 for (i=1; i<21; i++)
3115 {
3116 f = (FloatHi)num/(FloatHi)x;
3117 if (f < 1)
3118 {
3119 if (num < 0)
3120 return (i+1);
3121 else
3122 return i;
3123 }
3124 x = x*10;
3125 }
3126 if (num < 0)
3127 i++;
3128 return i;
3129 }
3130 static void PrintOutMultAlign(SeqAlignPtr sap)
3131 {
3132 Int4 c;
3133 Int4Ptr coord;
3134 Int4 ctr;
3135 Int4 d;
3136 Int4 j;
3137 Int4 len;
3138 Int4 n;
3139 Int4 spacer;
3140 CharPtr PNTR stringptr;
3141
3142 spacer = 12;
3143 AlnMgrIndexSingleChildSeqAlign(sap);
3144 n = AlnMgrGetNumRows(sap);
3145 stringptr = (CharPtr PNTR)MemNew(n*sizeof(CharPtr));
3146 coord = (Int4Ptr)MemNew(n*sizeof(Int4));
3147 len = AlnMgrGetAlnLength(sap, FALSE);
3148 for (c=0; c<len; c+=65-10)
3149 {
3150 for (j=0; j<n; j++)
3151 {
3152 stringptr[j] = SPI_WriteAlnLine(j+1, c, MIN(c+65-10-1, len-1), sap);
3153 coord[j] = SPI_MapRowCoords(sap, c, MIN(c+65-10-1, len-1), j+1, 1);
3154 if (coord[j] >= 0)
3155 coord[j]++;
3156 }
3157 for (j=0; j<n; j++)
3158 {
3159 printf("%d", coord[j]);
3160 d = spi_get_num_places(coord[j]);
3161 for (d; d<spacer; d++)
3162 {
3163 printf(" ");
3164 }
3165 if (j == 0)
3166 printf("%s", stringptr[j]);
3167 else
3168 {
3169 for (ctr=0; ctr<MIN(65-10, len-c); ctr++)
3170 {
3171 if (stringptr[j][ctr] == stringptr[0][ctr])
3172 printf(".");
3173 else
3174 printf("%c", stringptr[j][ctr]);
3175 }
3176 }
3177 printf("\n");
3178 MemFree(stringptr[j]);
3179 }
3180 if (c+65-10 < len)
3181 printf("\n");
3182 }
3183 fflush(stdout);
3184 }
3185
3186 static void PrintOutSegs(SeqAlignPtr sap)
3187 {
3188 DenseSegPtr dsp;
3189 Int4 i;
3190 Int4 j;
3191
3192 dsp = (DenseSegPtr)(sap->segs);
3193 printf("nums:\t");
3194 for (i=0; i<dsp->numseg; i++)
3195 {
3196 printf("%d\t", i+1);
3197 }
3198 printf("\n");
3199 printf("lens:\t");
3200 for (i=0; i<dsp->numseg; i++)
3201 {
3202 printf("%d\t", dsp->lens[i]);
3203 }
3204 printf("\n");
3205 for (i=0; i<dsp->dim; i++)
3206 {
3207 printf("row %d\t", i+1);
3208 for (j=0; j<dsp->numseg; j++)
3209 {
3210 printf("%d\t", dsp->starts[(dsp->dim)*j+i]);
3211 }
3212 printf("\n");
3213 }
3214 fflush(stdout);
3215 }
3216
3217 NLM_EXTERN SeqEntryPtr ALI_ConvertToNCBIData(AlignFileDataPtr afp)
3218 {
3219 BioseqPtr bsp;
3220 CharPtr PNTR deflines;
3221 Int4 i;
3222 Int4 len;
3223 Uint1 moltype;
3224 Int4 numseq;
3225 SeqAnnotPtr sanp;
3226 SeqAlignPtr sap;
3227 SeqDescrPtr sdp;
3228 SeqEntryPtr sep;
3229 SeqEntryPtr sep_head;
3230 SeqEntryPtr sep_prev;
3231 SeqIdPtr sip;
3232 CharPtr str;
3233 CharPtr PNTR strings;
3234
3235 if (afp == NULL || afp->sequences == NULL)
3236 {
3237 ErrPostEx(SEV_ERROR, 0, 0, "NULL Data Passed to ConvertToNCBIData");
3238 return NULL;
3239 }
3240 sap = ALI_MakeSeqAlign(afp, &strings, &numseq, &deflines);
3241 if (sap == NULL)
3242 {
3243 ErrPostEx(SEV_ERROR, 0, 0, "Unable to create seqentry\n");
3244 return NULL;
3245 }
3246 sanp = SeqAnnotNew();
3247 sanp->type = 2;
3248 sanp->data = (Pointer)sap;
3249 moltype = ALI_GuessMoltype(strings[0]);
3250 sip = ((DenseSegPtr)(sap->segs))->ids;
3251 sep_head = sep_prev = NULL;
3252 for (i=0; i<numseq; i++)
3253 {
3254 len = StringLen(strings[i]);
3255 sep = StringToSeqEntry (strings[i], sip, len, moltype);
3256 if (sep != NULL) {
3257 bsp = (BioseqPtr)(sep->data.ptrvalue);
3258 if (! StringHasNoText (deflines[i])) {
3259 str = deflines[i];
3260 sdp = SeqDescrAddPointer(&(bsp->descr), Seq_descr_title, str);
3261 }
3262 if (sep != NULL)
3263 {
3264 if (sep_head != NULL)
3265 {
3266 sep_prev->next = sep;
3267 sep_prev = sep;
3268 } else
3269 sep_head = sep_prev = sep;
3270 }
3271 sip = sip->next;
3272 MemFree(strings[i]);
3273 }
3274 }
3275 sep_head = ALI_make_seqentry_for_seqentry (sep_head);
3276 SeqAlignAddInSeqEntry (sep_head, sanp);
3277 MemFree(strings);
3278 MemFree(deflines);
3279 return sep_head;
3280 }
3281 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |