|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/api/aliread.c |
source navigation diff markup identifier search freetext search file search |
1 /* Include files */
2
3 #include <ncbi.h>
4 #include <objalign.h>
5
6 #include <aliparse.h>
7
8 #ifdef OS_UNIX_DARWIN
9 #define NLM_GETC fgetc
10 #else
11 #define NLM_GETC getc
12 #endif
13
14 /* Defined constants */
15
16 #define ALI_DATA_NUCLEOTIDE 1 /* Values for dataType */
17 #define ALI_DATA_PROTEIN 2 /* field in DataInfo. */
18 #define ALI_MAX_LINE_LEN 256
19
20 /* Function prototypes */
21
22 Boolean IsNucleotideChar (Char ch);
23 Boolean IsProteinChar (Char ch);
24 Boolean IsSequenceChar (Char ch,
25 CharPtr gapChar,
26 CharPtr missingChar,
27 CharPtr unalignedChar);
28 Int2 IsValidIdChar (Char idChar);
29 Boolean IsValidId (CharPtr idStr);
30 CharPtr ReadAlignFileLine (FILE PNTR alignFilePtr,
31 ErrInfoPtr PNTR errorListPtr,
32 AliConfigInfoPtr configPtr,
33 Boolean PNTR isEOF);
34
35
36 static Boolean s_MightBeCorruptSequence (Int4 seqCharCount,
37 CharPtr seqString,
38 AliConfigInfoPtr configPtr);
39 static DefLineInfoPtr s_ParseDefLine (CharPtr lineStr,
40 Int4 rowNum,
41 ErrInfoPtr PNTR errorListPtr);
42 static SeqLineInfoPtr s_ParseSequenceLine (CharPtr lineStr,
43 AliConfigInfoPtr configPtr);
44 CharPtr s_OtherGetValue (CharPtr otherStr);
45 static Boolean s_ProcessOtherLine (AliConfigInfoPtr configPtr,
46 CharPtr otherStr,
47 AlignFileDataPtr fileInfoPtr);
48 static OtherLineInfoPtr s_ParseOtherLine (CharPtr lineStr);
49
50 /*=========================================================================*/
51 /* */
52 /* IsNucleotideChar () */
53 /* */
54 /*=========================================================================*/
55
56 Boolean IsNucleotideChar (Char ch)
57 {
58 if (StringChr("abcdghkmnrstuvwxyABCDGHKMNRSTUVWXY",ch) != NULL)
59 return TRUE;
60 else
61 return FALSE;
62 }
63
64 /*=========================================================================*/
65 /* */
66 /* IsProteinChar () */
67 /* */
68 /*=========================================================================*/
69
70 Boolean IsProteinChar (Char ch)
71 {
72 if (StringChr("ABCDEFGHIKLMNPQRSTUVWXYZ*abcdefghiklmnpqrstuvwxyz",ch) != NULL)
73 return TRUE;
74 else
75 return FALSE;
76 }
77
78 /*=========================================================================*/
79 /* */
80 /* Ali_SeqLineGetType () */
81 /* */
82 /*=========================================================================*/
83
84 Int2 Ali_SeqLineGetType(CharPtr seqLine,
85 AliConfigInfoPtr configPtr)
86 {
87 Int4 position;
88 Int4 nuclCount;
89 Int4 miscCount;
90 FloatLo percentNucl;
91 FloatLo percentMisc;
92 Char commonNucls[20];
93 Char miscChars[5];
94
95 /* Is it definitely a protein sequence? */
96 /* The following chars are only in */
97 /* protein sequences. */
98
99 if ((StringChr (seqLine, 'E')) ||
100 (StringChr (seqLine, 'e')) ||
101 (StringChr (seqLine, 'F')) ||
102 (StringChr (seqLine, 'f')) ||
103 (StringChr (seqLine, 'I')) ||
104 (StringChr (seqLine, 'i')) ||
105 (StringChr (seqLine, 'L')) ||
106 (StringChr (seqLine, 'l')) ||
107 (StringChr (seqLine, 'P')) ||
108 (StringChr (seqLine, 'p')) ||
109 (StringChr (seqLine, 'Q')) ||
110 (StringChr (seqLine, 'q')) ||
111 /*
112 (StringChr (seqLine, 'U')) ||
113 (StringChr (seqLine, 'u')) ||
114 */
115 (StringChr (seqLine, 'Z')) ||
116 (StringChr (seqLine, 'z')) ||
117 (StringChr (seqLine, '*')))
118 return ALI_PROTEIN;
119
120 /* All others are technically ambiguous, but */
121 /* if we have a high enough percentage of */
122 /* common nucleotides, then it is probably a */
123 /* nucleotide sequence. */
124
125 nuclCount = 0;
126 miscCount = 0;
127 sprintf (commonNucls, "ATCGNXatcgnx");
128
129 if (configPtr->unalignedChar != NULL)
130 sprintf (miscChars, "-%s%s%s ", configPtr->gapChar,
131 configPtr->missingChar, configPtr->unalignedChar);
132 else
133 sprintf (miscChars, "-%s%s ", configPtr->gapChar,
134 configPtr->missingChar);
135
136 for (position = 0; seqLine[position] != '\0'; position++) {
137 if (StringChr (commonNucls, seqLine[position]) != NULL)
138 nuclCount++;
139 else if (StringChr (miscChars, seqLine[position]) != NULL)
140 miscCount++;
141 }
142
143 /* If we have a high percentage of misc chars then */
144 /* we don't have enough data to make a decision. */
145
146 percentMisc = ((FloatLo) miscCount) / ((FloatLo) StringLen (seqLine));
147 if ((percentMisc * 100) > 80)
148 return ALI_AMBIGUOUS;
149
150 /* Else, if a high percentage are common nucleotide */
151 /* characters then it is a nucleotide line. */
152
153 percentNucl = ((FloatLo) nuclCount + (FloatLo) miscCount) /
154 (FloatLo) StringLen (seqLine);
155
156 if ((percentNucl * 100) > configPtr->nuclLineMaxThreshold)
157 return ALI_NUCLEOTIDE;
158 else if ((percentNucl * 100) < configPtr->nuclLineMinThreshold)
159 return ALI_PROTEIN;
160
161 /* If we haven't come to a conclusion */
162 /* then say so. */
163
164 return ALI_AMBIGUOUS;
165 }
166
167 /*=========================================================================*/
168 /* */
169 /* IsSequenceChar () */
170 /* */
171 /*=========================================================================*/
172
173 Boolean IsSequenceChar (Char ch,
174 CharPtr gapChar,
175 CharPtr missingChar,
176 CharPtr unalignedChar)
177 {
178
179
180 if (IsNucleotideChar(ch))
181 return TRUE;
182
183 if (IsProteinChar(ch))
184 return TRUE;
185
186 if (StrChr (gapChar, ch) != NULL)
187 return TRUE;
188
189 if (StrChr (missingChar, ch) != NULL)
190 return TRUE;
191
192 if ((unalignedChar != NULL) &&
193 (StrChr (unalignedChar, ch) != NULL))
194 return TRUE;
195
196 return FALSE;
197 }
198
199 /*=========================================================================*/
200 /* */
201 /* IsValidIdChar () */
202 /* */
203 /*=========================================================================*/
204
205 #define ID_BAD_CHAR 0
206 #define ID_GOOD_CHAR_LETTER 1
207 #define ID_GOOD_CHAR_NUMBER 2
208 #define ID_GOOD_CHAR_OTHER 3
209
210 Int2 IsValidIdChar (Char idChar)
211 {
212 if (StringChr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",idChar) != NULL)
213 return ID_GOOD_CHAR_LETTER;
214
215 if (StringChr("abcdefghijklmnopqrstuvwxyz",idChar) != NULL)
216 return ID_GOOD_CHAR_LETTER;
217
218 if (StringChr("0123456789",idChar) != NULL)
219 return ID_GOOD_CHAR_NUMBER;
220
221 if (StringChr("\"._-|",idChar) != NULL)
222 return ID_GOOD_CHAR_OTHER;
223
224 return FALSE;
225 }
226
227 /*=========================================================================*/
228 /* */
229 /* IsValidId () */
230 /* */
231 /*=========================================================================*/
232
233 Boolean IsValidId (CharPtr idStr)
234 {
235 Int4 position;
236 Boolean letterFound = FALSE;
237 Int2 charType;
238
239 /* Check for illegal characters */
240
241 for (position = 0; idStr[position] != '\0'; position++)
242 {
243 charType = IsValidIdChar(idStr[position]);
244 switch (charType)
245 {
246 case ID_GOOD_CHAR_LETTER :
247 letterFound = TRUE;
248 break;
249 case ID_GOOD_CHAR_NUMBER :
250 case ID_GOOD_CHAR_OTHER :
251 break;
252 default:
253 return FALSE;
254 }
255 }
256
257 if (!letterFound)
258 return FALSE;
259
260 /* Check to see if ID matches a reserved word */
261
262 if (StringICmp (idStr,"MATRIX") == 0)
263 return FALSE;
264 else if (StringICmp (idStr,"BEGIN") == 0)
265 return FALSE;
266 else if (StringICmp (idStr, "END") == 0)
267 return FALSE;
268
269 /* If we passed all tests, mark */
270 /* it as a valid ID. */
271
272 return TRUE;
273 }
274
275 /*=========================================================================*/
276 /* */
277 /* IsNumString () */
278 /* */
279 /*=========================================================================*/
280
281 Boolean IsNumString (CharPtr someStr)
282 {
283 Int4 position;
284
285 /* Check for non-numeric characters */
286
287 for (position = 0; someStr[position] != '\0'; position++)
288 if (StrChr ("0123456789.", someStr[position]) == NULL)
289 return FALSE;
290
291 /* If we made it to here, all characters are */
292 /* numeric, and so the string is numeric. */
293
294 return TRUE;
295 }
296
297 /*=========================================================================*/
298 /* */
299 /* Ali_ChangeRowToOther () */
300 /* */
301 /*=========================================================================*/
302
303 void Ali_ChangeRowToOther (ValNodePtr rowPtr)
304 {
305 DefLineInfoPtr defLinePtr;
306 SeqLineInfoPtr seqLinePtr;
307 OtherLineInfoPtr otherLinePtr;
308
309 /* Sequence line to Other Line */
310
311 if (rowPtr->choice == ALI_SEQLINE)
312 {
313 otherLinePtr = (OtherLineInfoPtr) MemNew (sizeof(OtherLineInfo));
314 seqLinePtr = (SeqLineInfoPtr) rowPtr->data.ptrvalue;
315
316 if (seqLinePtr->junk != NULL)
317 sprintf(seqLinePtr->sequence,"%s%s",seqLinePtr->sequence,
318 seqLinePtr->junk);
319
320 if ((seqLinePtr->sequence != NULL) && (seqLinePtr->id == NULL))
321 {
322 if (IsValidId(seqLinePtr->sequence))
323 {
324 otherLinePtr->id = seqLinePtr->sequence;
325 otherLinePtr->other = NULL;
326 }
327 else
328 {
329 otherLinePtr->id = NULL;
330 otherLinePtr->other = seqLinePtr->sequence;
331 }
332 }
333 else
334 {
335 otherLinePtr->other = seqLinePtr->sequence;
336 otherLinePtr->id = seqLinePtr->id;
337 }
338
339 otherLinePtr->rowNum = seqLinePtr->rowNum;
340
341 MemFree(seqLinePtr);
342 rowPtr->data.ptrvalue = otherLinePtr;
343 rowPtr->choice = ALI_OTHERLINE;
344 }
345
346
347 /* Definition line to Other line */
348
349 else if (rowPtr->choice == ALI_DEFLINE)
350 {
351 otherLinePtr = (OtherLineInfoPtr) MemNew (sizeof(OtherLineInfo));
352 defLinePtr = (DefLineInfoPtr) rowPtr->data.ptrvalue;
353
354 otherLinePtr->other = defLinePtr->definitions;
355 otherLinePtr->id = defLinePtr->id;
356 otherLinePtr->rowNum = defLinePtr->rowNum;
357
358 MemFree(defLinePtr);
359 rowPtr->data.ptrvalue = otherLinePtr;
360 rowPtr->choice = ALI_OTHERLINE;
361 }
362
363 /* Return successfully */
364
365 return;
366 }
367
368 /*=========================================================================*/
369 /* */
370 /* ReadAlignFileLine() - */
371 /* */
372 /*=========================================================================*/
373
374 CharPtr ReadAlignFileLine (FILE PNTR alignFilePtr,
375 ErrInfoPtr PNTR errorListPtr,
376 AliConfigInfoPtr configPtr,
377 Boolean PNTR isEOF)
378
379 {
380 CharPtr lineStr = NULL;
381 CharPtr tempBuff = NULL;
382 Int4 totalLen = 0;
383 Int4 segmentLen = 0;
384 Int4 segmentCount = 1;
385 Boolean done = FALSE;
386 Char ch = 0;
387
388 /* Allocate memory for the line. More */
389 /* can be added later as necessary. */
390
391 lineStr = (CharPtr) MemNew(sizeof(Char) * configPtr->readBuffSize);
392 if (lineStr == NULL)
393 {
394 Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
395 return NULL;
396 }
397
398 /* Read in the characters one at a time */
399
400 while (!done && !(ch == EOF))
401 {
402
403 /* Process the current character */
404
405 ch = (Char) NLM_GETC (alignFilePtr);
406
407 if (ch == '\n')
408 {
409 done = TRUE;
410 ch = (Char) NLM_GETC (alignFilePtr);
411 if (ch != '\r') {
412 ungetc (ch, alignFilePtr);
413
414 }
415 }
416 else if (ch == '\r') {
417 done = TRUE;
418 }
419 else
420 {
421 lineStr[totalLen] = ch;
422 segmentLen++;
423 totalLen++;
424 }
425
426 /* Allocate more memory for the */
427 /* sequence if needed. */
428
429 if (segmentLen == configPtr->readBuffSize)
430 {
431 segmentCount++;
432 tempBuff = (CharPtr) MemNew(sizeof(Char) *
433 segmentCount *
434 configPtr->readBuffSize);
435 if (tempBuff == NULL)
436 {
437 Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
438 MemFree (lineStr);
439 MemFree (tempBuff);
440 return NULL;
441 }
442 MemCpy(tempBuff, lineStr, (segmentCount-1) * configPtr->readBuffSize);
443 MemFree(lineStr);
444 lineStr = tempBuff;
445 segmentLen = 0;
446 }
447
448 }
449
450 /* Return successfully */
451
452 if (EOF == ch)
453 *isEOF = TRUE;
454
455 lineStr[totalLen] = '\0';
456
457 return lineStr;
458 }
459
460 /*=========================================================================*/
461 /* */
462 /* s_ParseDefLine () - */
463 /* */
464 /*=========================================================================*/
465
466 #define DEFLINE_PRE_DATA 0
467 #define DEFLINE_DEFINITION 1
468 #define DEFLINE_SEQID 2
469
470 static DefLineInfoPtr s_ParseDefLine (CharPtr lineStr,
471 Int4 rowNum,
472 ErrInfoPtr PNTR errorListPtr)
473 {
474 Char ch;
475 CharPtr defStr;
476 CharPtr idStr;
477 Int4 defPosition;
478 Int4 idPosition;
479 Int4 position;
480 Int2 state;
481 DefLineInfoPtr defLinePtr = NULL;
482 ErrInfoPtr errPtr;
483
484 defPosition = 0;
485 idPosition = 0;
486
487 defStr = (CharPtr) MemNew (StringLen(lineStr)+1);
488 idStr = (CharPtr) MemNew (StringLen(lineStr)+1);
489
490 /* Parse the line character by character */
491
492 state = DEFLINE_PRE_DATA;
493
494 for (position = 0; lineStr[position] != '\0'; position++)
495 {
496 ch = lineStr[position];
497
498 switch (state)
499 {
500 case DEFLINE_PRE_DATA :
501 if (IS_WHITESP(ch))
502 continue;
503 else if (ch == '>')
504 state = DEFLINE_SEQID;
505 else
506 {
507 MemFree(defStr);
508 MemFree(idStr);
509 return NULL; /* Not a defline */
510 }
511 break;
512 case DEFLINE_SEQID :
513 if (IsValidIdChar(ch))
514 {
515 idStr[idPosition] = ch;
516 idPosition++;
517 }
518 else if (IS_WHITESP(ch))
519 {
520 if (idPosition > 0)
521 {
522 state = DEFLINE_DEFINITION;
523 defStr[defPosition] = ch;
524 defPosition++;
525 }
526 else
527 continue;
528 }
529 else if (ch == '[')
530 {
531 state = DEFLINE_DEFINITION;
532 defStr[defPosition] = ch;
533 defPosition++;
534 }
535 else
536 {
537 errPtr = Ali_AddError (errorListPtr, ERR_INVALID_DEFLINE,
538 lineStr, (Int4) ch);
539 errPtr->rowNum = rowNum;
540 MemFree(defStr);
541 MemFree(idStr);
542 return NULL;
543 }
544 break;
545 case DEFLINE_DEFINITION :
546 defStr[defPosition] = ch;
547 defPosition++;
548 break;
549 default:
550 break;
551 }
552 }
553
554 /* Check for blank line */
555
556 if (state == DEFLINE_PRE_DATA)
557 {
558 MemFree(defStr);
559 MemFree(idStr);
560 return NULL;
561 }
562
563 idStr[idPosition] = '\0';
564 defStr[defPosition] = '\0';
565
566 /* Make sure that it has at least one */
567 /* set of square brackets. */
568
569 if ((StringChr(defStr,'[') == NULL) || (StringChr(defStr,']') == NULL))
570 {
571 errPtr = Ali_AddError (errorListPtr, ERR_DEFLINE_NODEFS, lineStr);
572 errPtr->rowNum = rowNum;
573 }
574
575 /* If we made it to here, then */
576 /* it's a valid definition line. */
577
578 defLinePtr = (DefLineInfoPtr) MemNew (sizeof (DefLineInfo));
579
580 if (StringLen (defStr) != 0)
581 defLinePtr->definitions = defStr;
582 else
583 defLinePtr->definitions = NULL;
584
585 if (StringLen (idStr) != 0)
586 defLinePtr->id = idStr;
587 else
588 defLinePtr->id = NULL;
589
590 return defLinePtr;
591 }
592
593 /*=========================================================================*/
594 /* */
595 /* s_MightBeCorruptSequence () */
596 /* */
597 /*=========================================================================*/
598
599 static Boolean s_MightBeCorruptSequence (Int4 seqCharCount,
600 CharPtr seqString,
601 AliConfigInfoPtr configPtr)
602 {
603 Int4 i;
604 Int4 badCharCount;
605 Int4 seqStrLen;
606 FloatLo percentGood;
607
608 seqStrLen = StringLen(seqString);
609 badCharCount = 0;
610
611 for (i = 0; i < seqStrLen; i++)
612 {
613 if (IsSequenceChar(seqString[i],
614 configPtr->gapChar,
615 configPtr->missingChar,
616 configPtr->unalignedChar))
617 seqCharCount++;
618 else
619 badCharCount++;
620 }
621
622 percentGood = (FloatLo) seqCharCount / ((FloatLo) seqCharCount +
623 (FloatLo) badCharCount);
624
625 if ((percentGood * 100) >= configPtr->corruptSeqThreshold)
626 return TRUE;
627 else
628 return FALSE;
629 }
630
631 /*=========================================================================*/
632 /* */
633 /* s_ParseSequenceLine () - */
634 /* */
635 /*=========================================================================*/
636
637 #define PRE_DATA 0
638 #define FIRST_WORD 1
639 #define SEQUENCE_DATA 2
640 #define EOL_JUNK 3
641 #define POST_JUNK 4
642
643 static SeqLineInfoPtr s_ParseSequenceLine (CharPtr lineStr,
644 AliConfigInfoPtr configPtr)
645 {
646 CharPtr seqStr;
647 Int4 seqPosition = 0;
648 CharPtr idStr;
649 Int4 idPosition = 0;
650 Int4 firstWordLen = 0;
651 Char ch;
652 Int2 state = PRE_DATA;
653 Int4 position;
654 Boolean firstWordNotSequence = FALSE;
655 Boolean sequenceFound = FALSE;
656 CharPtr tempStr;
657 Boolean corruptSequence = FALSE;
658 SeqLineInfoPtr seqLinePtr;
659
660 if (StringLen(lineStr) == 0)
661 return NULL;
662
663 seqStr = (CharPtr) MemNew (StringLen(lineStr)+1);
664 idStr = (CharPtr) MemNew (StringLen(lineStr)+1);
665
666 for (position = 0; lineStr[position] != '\0'; position++)
667 {
668 ch = lineStr[position];
669
670 switch (state)
671 {
672 case PRE_DATA :
673
674 /* If it's the first non-whitespace char */
675 /* then we've found our first word. */
676
677 if (!IS_WHITESP(ch))
678 {
679 state = FIRST_WORD;
680 if (!IsSequenceChar(ch,
681 configPtr->gapChar,
682 configPtr->missingChar,
683 configPtr->unalignedChar))
684 firstWordNotSequence = TRUE;
685 idStr[idPosition] = ch;
686 idPosition++;
687 firstWordLen++;
688 }
689 break;
690 case FIRST_WORD :
691 if (IS_WHITESP(ch))
692 {
693 state = SEQUENCE_DATA;
694 if ((idPosition > 0) &&
695 (firstWordNotSequence == FALSE))
696 {
697 tempStr = seqStr;
698 seqStr = idStr;
699 idStr = tempStr;
700 seqPosition = idPosition;
701 idPosition = 0;
702 sequenceFound = TRUE;
703 }
704 }
705 else
706 {
707 /* If we find a non-sequence char in the */
708 /* first word then it might be an ID, */
709 /* with the sequence following. */
710
711 if (!IsSequenceChar(ch,
712 configPtr->gapChar,
713 configPtr->missingChar,
714 configPtr->unalignedChar))
715 firstWordNotSequence = TRUE;
716 idStr[idPosition] = ch;
717 idPosition++;
718 firstWordLen++;
719 }
720 break;
721 case SEQUENCE_DATA :
722 if (IS_WHITESP(ch))
723 continue;
724
725 /* If we're in a sequence, then a non-sequence */
726 /* char invalidates it, although we do allow */
727 /* 'junk' at the end. */
728
729 if (!IsSequenceChar(ch,
730 configPtr->gapChar,
731 configPtr->missingChar,
732 configPtr->unalignedChar))
733 {
734 if ((lineStr[position - 1] == ' ') && sequenceFound)
735 state = EOL_JUNK;
736 else if ((corruptSequence == TRUE) ||
737 (s_MightBeCorruptSequence (seqPosition,
738 &(lineStr[position]),
739 configPtr)))
740 {
741 seqStr[seqPosition] = ch;
742 seqPosition++;
743 sequenceFound = TRUE;
744 corruptSequence = TRUE;
745 }
746 else
747 {
748 MemFree(seqStr);
749 MemFree(idStr);
750 return NULL;
751 }
752 }
753 else
754 {
755 seqStr[seqPosition] = ch;
756 seqPosition++;
757 sequenceFound = TRUE;
758 }
759 break;
760 case EOL_JUNK :
761 if (IS_WHITESP(ch))
762 state = POST_JUNK;
763 break;
764 case POST_JUNK :
765
766 /* Only one 'word' of junk allowed */
767
768 if (!IS_WHITESP(ch))
769 {
770 MemFree(seqStr);
771 MemFree(idStr);
772 return NULL;
773 }
774 break;
775 }
776 }
777
778 /* Check for blank line */
779
780 if (state == PRE_DATA)
781 {
782 MemFree(seqStr);
783 MemFree(idStr);
784 return NULL;
785 }
786
787 if (state == FIRST_WORD)
788 {
789
790 /* If there was just one word, and it isn't */
791 /* a sequence string, then this isn't a */
792 /* sequence line. */
793
794 if (firstWordNotSequence == TRUE)
795 {
796 MemFree(seqStr);
797 MemFree(idStr);
798 return NULL;
799 }
800
801 /* If there was just one word, and it IS a sequence */
802 /* then the idStr is actually the seqStr. */
803
804 else
805 {
806 tempStr = seqStr;
807 seqStr = idStr;
808 idStr = tempStr;
809 seqPosition = idPosition;
810 idPosition = 0;
811 }
812 }
813
814 /* If still no sequence string, */
815 /* then not a sequence line. */
816
817 if (StringLen(seqStr) == 0)
818 {
819 MemFree(seqStr);
820 MemFree(idStr);
821 return NULL;
822 }
823
824 /* Check to see if the ID is a valid one */
825
826 idStr[idPosition] = '\0';
827 seqStr[seqPosition] = '\0';
828
829 if ((idPosition > 0) &&
830 (IsValidId (idStr) == FALSE) &&
831 (IsNumString (idStr) == FALSE))
832 {
833 MemFree(idStr);
834 MemFree(seqStr);
835 return NULL;
836 }
837
838 /* If we made it to here, then */
839 /* it's a valid sequence line. */
840
841 seqLinePtr = (SeqLineInfoPtr) MemNew (sizeof (SeqLineInfo));
842
843 seqLinePtr->firstWordLen = firstWordLen;
844
845 if (StringLen (seqStr) != 0)
846 seqLinePtr->sequence = seqStr;
847 else
848 {
849 seqLinePtr->sequence = NULL;
850 MemFree (seqStr);
851 }
852
853 if (StringLen (idStr) != 0)
854 seqLinePtr->id = idStr;
855 else
856 {
857 seqLinePtr->id = NULL;
858 MemFree (idStr);
859 }
860
861 if (corruptSequence)
862 seqLinePtr->maybe = TRUE;
863 else
864 seqLinePtr->maybe = FALSE;
865
866 return seqLinePtr;
867 }
868
869 /*=========================================================================*/
870 /* */
871 /* s_ParseOtherLine () - */
872 /* */
873 /*=========================================================================*/
874
875 #define OTHER_PRE_DATA 0
876 #define OTHER_DATA 1
877
878 static OtherLineInfoPtr s_ParseOtherLine (CharPtr lineStr)
879 {
880 Char ch;
881 CharPtr otherStr;
882 Int4 otherPosition;
883 Int4 position;
884 Int2 state;
885 Int4 wordCount;
886 OtherLineInfoPtr otherLinePtr;
887
888 /* Parse the line character by character */
889
890 otherStr = (CharPtr) MemNew (StringLen(lineStr)+1);
891 otherPosition = 0;
892 state = OTHER_PRE_DATA;
893 wordCount = 0;
894
895 for (position = 0; lineStr[position] != '\0'; position++)
896 {
897 ch = lineStr[position];
898
899 switch (state)
900 {
901 case OTHER_PRE_DATA :
902 if (IS_WHITESP(ch))
903 continue;
904 else
905 {
906 wordCount = 1;
907 state = OTHER_DATA;
908 otherStr[otherPosition] = ch;
909 otherPosition++;
910 }
911 break;
912 case OTHER_DATA :
913 if (IS_WHITESP(ch))
914 wordCount++;
915 otherStr[otherPosition] = ch;
916 otherPosition++;
917 break;
918 default:
919 break;
920 }
921 }
922
923 /* Check for blank line */
924
925 if (state == OTHER_PRE_DATA)
926 {
927 MemFree(otherStr);
928 return NULL;
929 }
930
931 /* If we made it to here, then */
932 /* it's a valid definition line. */
933
934 otherStr[otherPosition] = '\0';
935
936 otherLinePtr = (OtherLineInfoPtr) MemNew (sizeof (OtherLineInfo));
937 if ((wordCount == 1) && IsValidId(otherStr))
938 {
939 otherLinePtr->id = otherStr;
940 otherLinePtr->other = NULL;
941 }
942 else
943 {
944 otherLinePtr->id = NULL;
945 otherLinePtr->other = otherStr;
946 }
947
948 /* Return successfully */
949
950 return otherLinePtr;
951 }
952
953 /*=========================================================================*/
954 /* */
955 /* s_OtherGetValue () */
956 /* */
957 /*=========================================================================*/
958
959 CharPtr s_OtherGetValue (CharPtr otherStr)
960 {
961 CharPtr tempStrPtr;
962 CharPtr valueBuff;
963 Int2 charCount;
964
965 /* Go to the first character after the '=' */
966
967 if ((tempStrPtr = StringChr (otherStr, '=')) == NULL)
968 return NULL;
969 tempStrPtr++;
970
971 /* Skip spaces */
972
973 while (*tempStrPtr == ' ')
974 tempStrPtr++;
975
976 if (*tempStrPtr == '\0')
977 return NULL;
978
979 /* Get the value */
980
981 valueBuff = (CharPtr) MemNew (ALI_MAX_LINE_LEN);
982 charCount = 0;
983
984 while ((*tempStrPtr != ' ') && (*tempStrPtr != '\0'))
985 {
986 valueBuff[charCount] = *tempStrPtr;
987 charCount++;
988 tempStrPtr++;
989 }
990
991 valueBuff[charCount] = '\0';
992
993 /* Return successfully */
994
995 return valueBuff;
996 }
997
998 /*=========================================================================*/
999 /* */
1000 /* s_ProcessOtherLine () */
1001 /* */
1002 /*=========================================================================*/
1003
1004 static Boolean s_ProcessOtherLine (AliConfigInfoPtr configPtr,
1005 CharPtr otherStr,
1006 AlignFileDataPtr fileInfoPtr)
1007 {
1008 CharPtr strPtr;
1009 CharPtr tmpStr;
1010
1011 /* Check for datatype declaration */
1012
1013 if (((strPtr = StringStr (otherStr, "datatype")) != NULL) ||
1014 ((strPtr = StringStr (otherStr, "DATATYPE")) != NULL) ||
1015 ((strPtr = StringStr (otherStr, "Datatype")) != NULL))
1016 {
1017 tmpStr = s_OtherGetValue(strPtr);
1018 if (StringICmp (tmpStr, "DNA") == 0)
1019 configPtr->declaredInfo.dataType = ALI_DATA_NUCLEOTIDE;
1020 else
1021 configPtr->declaredInfo.dataType = ALI_DATA_PROTEIN;
1022 MemFree (tmpStr);
1023 }
1024
1025 /* Check for interleaved/contiguous */
1026
1027 if (((strPtr = StringStr (otherStr, "INTERLEAVED")) != NULL) ||
1028 ((strPtr = StringStr (otherStr, "interleaved")) != NULL) ||
1029 ((strPtr = StringStr (otherStr, "Interleaved")) != NULL))
1030 configPtr->declaredInfo.contigOrInter = ALI_INTERLEAVED;
1031 else if (((strPtr = StringStr (otherStr, "CONTIGUOUS")) != NULL) ||
1032 ((strPtr = StringStr (otherStr, "contiguous")) != NULL) ||
1033 ((strPtr = StringStr (otherStr, "Contiguous")) != NULL))
1034 configPtr->declaredInfo.contigOrInter = ALI_CONTIGUOUS;
1035
1036 /* Check for dimensions */
1037
1038 if (((strPtr = StringStr (otherStr, "NTAX")) != NULL) ||
1039 ((strPtr = StringStr (otherStr, "ntax")) != NULL) ||
1040 ((strPtr = StringStr (otherStr, "nTax")) != NULL))
1041 {
1042 tmpStr = s_OtherGetValue (strPtr);
1043 configPtr->declaredInfo.idCount = atoi(tmpStr);
1044 MemFree (tmpStr);
1045 }
1046
1047 if (((strPtr = StringStr (otherStr, "NCHAR")) != NULL) ||
1048 ((strPtr = StringStr (otherStr, "nchar")) != NULL) ||
1049 ((strPtr = StringStr (otherStr, "nChar")) != NULL))
1050 {
1051 tmpStr = s_OtherGetValue (strPtr);
1052 configPtr->declaredInfo.seqLength = atoi(tmpStr);
1053 MemFree (tmpStr);
1054 }
1055
1056 /* Check for definition of missing character */
1057
1058 if (((strPtr = StringStr (otherStr, "MISSING")) != NULL) ||
1059 ((strPtr = StringStr (otherStr, "missing")) != NULL) ||
1060 ((strPtr = StringStr (otherStr, "Missing")) != NULL))
1061 {
1062 tmpStr = s_OtherGetValue(strPtr);
1063 configPtr->missingChar = (CharPtr) MemNew (2);
1064 sprintf (configPtr->missingChar, "%c", tmpStr[0]);
1065 MemFree (tmpStr);
1066
1067 /* If the new missing char conflicts with the */
1068 /* gap or unaligned char, then blank them out */
1069 /* to give the new one precedence. */
1070
1071 if (StringICmp (configPtr->missingChar, configPtr->gapChar) == 0)
1072 StringCpy (configPtr->gapChar, "");
1073
1074 if (StringICmp (configPtr->missingChar, configPtr->unalignedChar) == 0)
1075 StringCpy (configPtr->unalignedChar, "");
1076
1077 }
1078
1079 /* Check for definition of gap character */
1080
1081 if (((strPtr = StringStr (otherStr, "GAP")) != NULL) ||
1082 ((strPtr = StringStr (otherStr, "gap")) != NULL) ||
1083 ((strPtr = StringStr (otherStr, "Gap")) != NULL))
1084 {
1085 tmpStr = s_OtherGetValue(strPtr);
1086 configPtr->gapChar = (CharPtr) MemNew (2);
1087 sprintf (configPtr->gapChar, "%c", tmpStr[0]);
1088 MemFree (tmpStr);
1089
1090 /* If the new gap char conflicts with the missing */
1091 /* or unaligned char, then blank them out to give */
1092 /* the new one precedence. */
1093
1094 if (StringICmp (configPtr->gapChar, configPtr->missingChar) == 0)
1095 StringCpy (configPtr->missingChar, "");
1096
1097 if (StringICmp (configPtr->gapChar, configPtr->unalignedChar) == 0)
1098 StringCpy (configPtr->unalignedChar, "");
1099
1100 }
1101
1102 /* Check for definition of unaligned character */
1103
1104 if (((strPtr = StringStr (otherStr, "UNALIGNED")) != NULL) ||
1105 ((strPtr = StringStr (otherStr, "unaligned")) != NULL) ||
1106 ((strPtr = StringStr (otherStr, "Unaligned")) != NULL))
1107 {
1108 tmpStr = s_OtherGetValue(strPtr);
1109 configPtr->unalignedChar = (CharPtr) MemNew (2);
1110 sprintf (configPtr->unalignedChar, "%c", tmpStr[0]);
1111 MemFree (tmpStr);
1112
1113 /* If the new unaligned char conflicts with the */
1114 /* gap or missing char, then blank them out to */
1115 /* give the new one precedence. */
1116
1117 if (StringICmp (configPtr->unalignedChar, configPtr->gapChar) == 0)
1118 StringCpy (configPtr->gapChar, "");
1119
1120 if (StringICmp (configPtr->unalignedChar, configPtr->missingChar) == 0)
1121 StringCpy (configPtr->missingChar, "");
1122
1123 }
1124
1125 /* Return successfully */
1126
1127 return TRUE;
1128 }
1129
1130 /*=========================================================================*/
1131 /* */
1132 /* Ali_ReadLines () */
1133 /* */
1134 /*=========================================================================*/
1135
1136 ValNodePtr Ali_ReadLines (FILE PNTR alignFilePtr,
1137 ErrInfoPtr PNTR errorListPtr,
1138 AliConfigInfoPtr configPtr,
1139 AlignFileDataPtr fileInfoPtr)
1140 {
1141 CharPtr lineStr = NULL;
1142 ValNodePtr rowList = NULL;
1143 ValNodePtr newRow;
1144 SeqLineInfoPtr seqLine;
1145 SeqLineInfoPtr reEvalSeqPtr;
1146 DefLineInfoPtr defLine;
1147 OtherLineInfoPtr otherLine;
1148 Boolean nextRowMustBeSeq;
1149 Boolean idFound;
1150 Boolean lastRowWasOther = FALSE;
1151 Int4 rowNum;
1152 ErrInfoPtr errPtr;
1153 Boolean isEOF;
1154
1155 nextRowMustBeSeq = FALSE;
1156 rowNum = 0;
1157 isEOF = FALSE;
1158
1159 while (FALSE == isEOF)
1160 {
1161
1162 /* Process the line according to its content ... */
1163
1164 lineStr = ReadAlignFileLine(alignFilePtr, errorListPtr,
1165 configPtr, &isEOF);
1166 if (lineStr == NULL) {
1167 return NULL;
1168 }
1169
1170 rowNum++;
1171
1172 /* ... DefLine */
1173
1174 if ((defLine = s_ParseDefLine(lineStr, rowNum, errorListPtr)) != NULL)
1175 {
1176 defLine->rowNum = rowNum;
1177 lastRowWasOther = FALSE;
1178 if (nextRowMustBeSeq)
1179 nextRowMustBeSeq = FALSE;
1180
1181 /* If we found an ID, then the next */
1182 /* row must have a sequence. */
1183
1184 if ((defLine->id != NULL) && (StringLen(defLine->id) != 0))
1185 nextRowMustBeSeq = TRUE;
1186 else
1187 nextRowMustBeSeq = FALSE;
1188
1189 /* Add a record for the defline */
1190
1191 newRow = ValNodeAdd(&rowList);
1192 if (NULL == newRow)
1193 {
1194 errPtr = Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
1195 errPtr->rowNum = rowNum;
1196 return NULL;
1197 }
1198
1199 newRow->choice = ALI_DEFLINE;
1200 newRow->data.ptrvalue = defLine;
1201
1202 }
1203
1204 /* ... Sequence Data */
1205
1206 else if ((seqLine = s_ParseSequenceLine(lineStr, configPtr))
1207 != NULL)
1208 {
1209 seqLine->rowNum = rowNum;
1210
1211 /* Is it a Nucleotide sequence or a Protein sequence? */
1212
1213 seqLine->type = Ali_SeqLineGetType(seqLine->sequence, configPtr);
1214
1215 /* Add a record for the sequence */
1216
1217 newRow = ValNodeAdd(&rowList);
1218 if (NULL == newRow)
1219 {
1220 errPtr = Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
1221 errPtr->rowNum = rowNum;
1222 return NULL;
1223 }
1224
1225 newRow->data.ptrvalue = seqLine;
1226
1227 /* Mark it as a sequence line */
1228
1229 if ((seqLine->maybe == FALSE) ||
1230 ((seqLine->maybe == TRUE) && (configPtr->useMaybes == TRUE)))
1231 {
1232
1233 if (StringLen(seqLine->id) != 0)
1234 lastRowWasOther = FALSE;
1235
1236 newRow->choice = ALI_SEQLINE;
1237
1238 if (nextRowMustBeSeq)
1239 nextRowMustBeSeq = FALSE;
1240
1241 /* A sequence must follow either a defline, */
1242 /* an ID, or another sequence. */
1243
1244 if (lastRowWasOther == TRUE)
1245 {
1246 reEvalSeqPtr = SeqLineReEval (seqLine);
1247 if (NULL == reEvalSeqPtr)
1248 Ali_ChangeRowToOther(newRow);
1249 else
1250 newRow->data.ptrvalue = reEvalSeqPtr;
1251 }
1252 }
1253 else /* A 'maybe' sequence that we're not using */
1254 {
1255 Ali_ChangeRowToOther(newRow);
1256 lastRowWasOther = TRUE;
1257 }
1258 }
1259
1260 /* ... Other */
1261
1262 else
1263 {
1264 if (StringLen(lineStr) > 0)
1265 {
1266 if ((otherLine = s_ParseOtherLine(lineStr)) != NULL)
1267 {
1268 otherLine->rowNum = rowNum;
1269 if (otherLine->id != NULL)
1270 {
1271 idFound = TRUE;
1272 lastRowWasOther = FALSE;
1273 }
1274 else
1275 {
1276 idFound = FALSE;
1277 lastRowWasOther = TRUE;
1278 }
1279
1280 newRow = ValNodeAdd(&rowList);
1281 if (NULL == newRow)
1282 {
1283 errPtr = Ali_AddError (errorListPtr,
1284 ERR_OUT_OF_MEMORY);
1285 errPtr->rowNum = rowNum;
1286 return NULL;
1287 }
1288
1289 newRow->choice = ALI_OTHERLINE;
1290 newRow->data.ptrvalue = otherLine;
1291
1292 /* If the next row needs to be a Sequence, */
1293 /* and we're not still on the same row, */
1294 /* then change the previous ID to other. */
1295
1296 if (nextRowMustBeSeq && !idFound)
1297 nextRowMustBeSeq = FALSE;
1298
1299 if (idFound)
1300 nextRowMustBeSeq = TRUE;
1301 else
1302 nextRowMustBeSeq = FALSE;
1303
1304 /* Attempt to parse any configuration */
1305 /* information from the line. */
1306
1307 if (otherLine->other != NULL)
1308 s_ProcessOtherLine (configPtr, otherLine->other, fileInfoPtr);
1309
1310 }
1311 }
1312 }
1313 MemFree (lineStr);
1314 }
1315
1316 return rowList;
1317 }
1318 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |