NCBI C Toolkit Cross Reference

C/api/aliread.c


  1 /* Include files */
  2 
  3 #include <ncbi.h>
  4 #include <objalign.h>
  5 
  6 #include <aliparse.h>
  7 
  8 #ifdef OS_UNIX_DARWIN
  9 #define NLM_GETC fgetc
 10 #else
 11 #define NLM_GETC getc
 12 #endif
 13 
 14 /* Defined constants */
 15 
 16 #define ALI_DATA_NUCLEOTIDE  1      /* Values for dataType   */
 17 #define ALI_DATA_PROTEIN     2      /* field in DataInfo.    */
 18 #define ALI_MAX_LINE_LEN     256
 19 
 20 /* Function prototypes */
 21 
 22 Boolean     IsNucleotideChar (Char ch);
 23 Boolean     IsProteinChar (Char ch);
 24 Boolean     IsSequenceChar (Char    ch,
 25                             CharPtr gapChar,
 26                             CharPtr missingChar,
 27                             CharPtr unalignedChar);
 28 Int2        IsValidIdChar (Char idChar);
 29 Boolean     IsValidId (CharPtr idStr);
 30 CharPtr     ReadAlignFileLine (FILE PNTR        alignFilePtr,
 31                                ErrInfoPtr PNTR  errorListPtr,
 32                                AliConfigInfoPtr configPtr,
 33                                Boolean    PNTR  isEOF);
 34 
 35 
 36 static Boolean          s_MightBeCorruptSequence (Int4             seqCharCount,
 37                                                   CharPtr          seqString,
 38                                                   AliConfigInfoPtr configPtr);
 39 static DefLineInfoPtr   s_ParseDefLine (CharPtr          lineStr,
 40                                         Int4             rowNum,
 41                                         ErrInfoPtr PNTR  errorListPtr);
 42 static SeqLineInfoPtr   s_ParseSequenceLine (CharPtr lineStr,
 43                                              AliConfigInfoPtr configPtr);
 44 CharPtr                 s_OtherGetValue (CharPtr otherStr);
 45 static Boolean          s_ProcessOtherLine (AliConfigInfoPtr configPtr,
 46                                             CharPtr          otherStr,
 47                                             AlignFileDataPtr fileInfoPtr);
 48 static OtherLineInfoPtr s_ParseOtherLine (CharPtr lineStr);
 49 
 50 /*=========================================================================*/
 51 /*                                                                         */
 52 /*  IsNucleotideChar ()                                                    */
 53 /*                                                                         */
 54 /*=========================================================================*/
 55 
 56 Boolean IsNucleotideChar (Char ch)
 57 {
 58   if (StringChr("abcdghkmnrstuvwxyABCDGHKMNRSTUVWXY",ch) != NULL)
 59     return TRUE;
 60   else
 61     return FALSE;
 62 }
 63 
 64 /*=========================================================================*/
 65 /*                                                                         */
 66 /*  IsProteinChar ()                                                       */
 67 /*                                                                         */
 68 /*=========================================================================*/
 69 
 70 Boolean IsProteinChar (Char ch)
 71 {
 72   if (StringChr("ABCDEFGHIKLMNPQRSTUVWXYZ*abcdefghiklmnpqrstuvwxyz",ch) != NULL)
 73     return TRUE;
 74   else
 75     return FALSE;
 76 }
 77 
 78 /*=========================================================================*/
 79 /*                                                                         */
 80 /* Ali_SeqLineGetType ()                                                   */
 81 /*                                                                         */
 82 /*=========================================================================*/
 83 
 84 Int2 Ali_SeqLineGetType(CharPtr seqLine,
 85                         AliConfigInfoPtr configPtr)
 86 {
 87   Int4    position;
 88   Int4    nuclCount;
 89   Int4    miscCount;
 90   FloatLo percentNucl;
 91   FloatLo percentMisc;
 92   Char    commonNucls[20];
 93   Char    miscChars[5];
 94 
 95   /* Is it definitely a protein sequence? */
 96   /* The following chars are only in      */
 97   /* protein sequences.                   */
 98 
 99   if ((StringChr (seqLine, 'E')) ||
100       (StringChr (seqLine, 'e')) ||
101       (StringChr (seqLine, 'F')) ||
102       (StringChr (seqLine, 'f')) ||
103       (StringChr (seqLine, 'I')) ||
104       (StringChr (seqLine, 'i')) ||
105       (StringChr (seqLine, 'L')) ||
106       (StringChr (seqLine, 'l')) ||
107       (StringChr (seqLine, 'P')) ||
108       (StringChr (seqLine, 'p')) ||
109       (StringChr (seqLine, 'Q')) ||
110       (StringChr (seqLine, 'q')) ||
111       /*
112       (StringChr (seqLine, 'U')) ||
113       (StringChr (seqLine, 'u')) ||
114       */
115       (StringChr (seqLine, 'Z')) ||
116       (StringChr (seqLine, 'z')) ||
117       (StringChr (seqLine, '*')))
118     return ALI_PROTEIN;
119 
120   /* All others are technically ambiguous, but */
121   /* if we have a high enough percentage of    */
122   /* common nucleotides, then it is probably a */
123   /* nucleotide sequence.                      */
124   
125   nuclCount = 0;
126   miscCount = 0;
127   sprintf (commonNucls, "ATCGNXatcgnx");
128 
129   if (configPtr->unalignedChar != NULL)
130     sprintf (miscChars, "-%s%s%s ", configPtr->gapChar,
131              configPtr->missingChar, configPtr->unalignedChar);
132   else
133     sprintf (miscChars, "-%s%s ", configPtr->gapChar,
134              configPtr->missingChar);
135 
136   for (position = 0; seqLine[position] != '\0'; position++) {
137     if (StringChr (commonNucls, seqLine[position]) != NULL)
138       nuclCount++;
139     else if (StringChr (miscChars, seqLine[position]) != NULL)
140       miscCount++;
141   }
142 
143   /* If we have a high percentage of misc chars then */
144   /* we don't have enough data to make a decision.   */
145 
146   percentMisc = ((FloatLo) miscCount) / ((FloatLo) StringLen (seqLine));
147   if ((percentMisc * 100) > 80)
148     return ALI_AMBIGUOUS;
149 
150   /* Else, if a high percentage are common nucleotide */
151   /* characters then it is a nucleotide line.         */
152 
153   percentNucl = ((FloatLo) nuclCount + (FloatLo) miscCount) / 
154                 (FloatLo) StringLen (seqLine);
155 
156   if ((percentNucl * 100) > configPtr->nuclLineMaxThreshold)
157     return ALI_NUCLEOTIDE;
158   else if ((percentNucl * 100) < configPtr->nuclLineMinThreshold)
159     return ALI_PROTEIN;
160 
161   /* If we haven't come to a conclusion */
162   /* then say so.                       */
163 
164   return ALI_AMBIGUOUS;
165 }
166 
167 /*=========================================================================*/
168 /*                                                                         */
169 /*  IsSequenceChar ()                                                      */
170 /*                                                                         */
171 /*=========================================================================*/
172 
173 Boolean IsSequenceChar (Char    ch,
174                         CharPtr gapChar,
175                         CharPtr missingChar,
176                         CharPtr unalignedChar)
177 {
178 
179   
180   if (IsNucleotideChar(ch))
181     return TRUE;
182 
183   if (IsProteinChar(ch))
184     return TRUE;
185 
186   if (StrChr (gapChar, ch) != NULL)
187     return TRUE;
188 
189   if (StrChr (missingChar, ch) != NULL)
190     return TRUE;
191 
192   if ((unalignedChar != NULL) &&
193       (StrChr (unalignedChar, ch) != NULL))
194     return TRUE;
195 
196   return FALSE;
197 }
198 
199 /*=========================================================================*/
200 /*                                                                         */
201 /* IsValidIdChar ()                                                        */
202 /*                                                                         */
203 /*=========================================================================*/
204 
205 #define ID_BAD_CHAR         0
206 #define ID_GOOD_CHAR_LETTER 1
207 #define ID_GOOD_CHAR_NUMBER 2
208 #define ID_GOOD_CHAR_OTHER  3
209 
210 Int2 IsValidIdChar (Char idChar)
211 {
212   if (StringChr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",idChar) != NULL)
213     return ID_GOOD_CHAR_LETTER;
214 
215   if (StringChr("abcdefghijklmnopqrstuvwxyz",idChar) != NULL)
216     return ID_GOOD_CHAR_LETTER;
217 
218   if (StringChr("0123456789",idChar) != NULL)
219     return ID_GOOD_CHAR_NUMBER;
220 
221   if (StringChr("\"._-|",idChar) != NULL)
222     return ID_GOOD_CHAR_OTHER;
223 
224   return FALSE;
225 }
226 
227 /*=========================================================================*/
228 /*                                                                         */
229 /* IsValidId ()                                                            */
230 /*                                                                         */
231 /*=========================================================================*/
232 
233 Boolean IsValidId (CharPtr idStr)
234 {
235   Int4    position;
236   Boolean letterFound = FALSE;
237   Int2    charType;
238 
239   /* Check for illegal characters */
240 
241   for (position = 0; idStr[position] != '\0'; position++)
242     {
243       charType = IsValidIdChar(idStr[position]);
244       switch (charType)
245         {
246         case ID_GOOD_CHAR_LETTER :
247           letterFound = TRUE;
248           break;
249         case ID_GOOD_CHAR_NUMBER :
250         case ID_GOOD_CHAR_OTHER :
251           break;
252         default:
253           return FALSE;
254         }
255     }
256 
257   if (!letterFound)
258     return FALSE;
259 
260   /* Check to see if ID matches a reserved word */
261 
262   if (StringICmp (idStr,"MATRIX") == 0)
263     return FALSE;
264   else if (StringICmp (idStr,"BEGIN") == 0)
265     return FALSE;
266   else if (StringICmp (idStr, "END") == 0)
267     return FALSE;
268 
269   /* If we passed all tests, mark */
270   /* it as a valid ID.            */
271 
272   return TRUE;
273 }
274 
275 /*=========================================================================*/
276 /*                                                                         */
277 /* IsNumString ()                                                          */
278 /*                                                                         */
279 /*=========================================================================*/
280 
281 Boolean IsNumString (CharPtr someStr)
282 {
283   Int4    position;
284 
285   /* Check for non-numeric characters */
286 
287   for (position = 0; someStr[position] != '\0'; position++)
288     if (StrChr ("0123456789.", someStr[position]) == NULL)
289       return FALSE;
290 
291   /* If we made it to here, all characters are */
292   /* numeric, and so the string is numeric.    */
293 
294   return TRUE;
295 }
296 
297 /*=========================================================================*/
298 /*                                                                         */
299 /* Ali_ChangeRowToOther ()                                                 */
300 /*                                                                         */
301 /*=========================================================================*/
302 
303 void Ali_ChangeRowToOther (ValNodePtr rowPtr)
304 {
305   DefLineInfoPtr   defLinePtr;
306   SeqLineInfoPtr   seqLinePtr;
307   OtherLineInfoPtr otherLinePtr;
308 
309   /* Sequence line to Other Line */
310 
311   if (rowPtr->choice == ALI_SEQLINE)
312     {
313       otherLinePtr = (OtherLineInfoPtr) MemNew (sizeof(OtherLineInfo));
314       seqLinePtr = (SeqLineInfoPtr) rowPtr->data.ptrvalue;
315 
316       if (seqLinePtr->junk != NULL)
317         sprintf(seqLinePtr->sequence,"%s%s",seqLinePtr->sequence,
318                 seqLinePtr->junk);
319 
320       if ((seqLinePtr->sequence != NULL) && (seqLinePtr->id == NULL))
321         {
322           if (IsValidId(seqLinePtr->sequence))
323             {
324               otherLinePtr->id    = seqLinePtr->sequence;
325               otherLinePtr->other = NULL;
326             }
327           else
328             {
329               otherLinePtr->id    = NULL;
330               otherLinePtr->other = seqLinePtr->sequence;
331             }
332         }
333       else
334         {
335           otherLinePtr->other = seqLinePtr->sequence;
336           otherLinePtr->id    = seqLinePtr->id;
337         }
338 
339       otherLinePtr->rowNum = seqLinePtr->rowNum;
340 
341       MemFree(seqLinePtr);
342       rowPtr->data.ptrvalue = otherLinePtr;
343       rowPtr->choice = ALI_OTHERLINE;
344     }
345 
346 
347   /* Definition line to Other line */
348 
349   else if (rowPtr->choice == ALI_DEFLINE)
350     {
351       otherLinePtr = (OtherLineInfoPtr) MemNew (sizeof(OtherLineInfo));
352       defLinePtr = (DefLineInfoPtr) rowPtr->data.ptrvalue;
353 
354       otherLinePtr->other = defLinePtr->definitions;
355       otherLinePtr->id    = defLinePtr->id;
356       otherLinePtr->rowNum   = defLinePtr->rowNum;
357 
358       MemFree(defLinePtr);
359       rowPtr->data.ptrvalue = otherLinePtr;
360       rowPtr->choice = ALI_OTHERLINE;
361     }
362 
363   /* Return successfully */
364 
365   return;
366 }
367 
368 /*=========================================================================*/
369 /*                                                                         */
370 /* ReadAlignFileLine() -                                                   */
371 /*                                                                         */
372 /*=========================================================================*/
373 
374 CharPtr ReadAlignFileLine (FILE PNTR        alignFilePtr,
375                            ErrInfoPtr PNTR  errorListPtr,
376                            AliConfigInfoPtr configPtr,
377                            Boolean    PNTR  isEOF)
378      
379 {
380   CharPtr lineStr = NULL;
381   CharPtr tempBuff = NULL;
382   Int4    totalLen = 0;
383   Int4    segmentLen = 0;
384   Int4    segmentCount = 1;
385   Boolean done = FALSE;
386   Char    ch = 0;
387 
388   /* Allocate memory for the line.  More */
389   /* can be added later as necessary.    */
390 
391   lineStr = (CharPtr) MemNew(sizeof(Char) * configPtr->readBuffSize);
392   if (lineStr == NULL)
393     {
394       Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
395       return NULL;
396     }
397 
398   /* Read in the characters one at a time */
399 
400   while (!done && !(ch == EOF))
401     {
402 
403       /* Process the current character */
404 
405       ch = (Char) NLM_GETC (alignFilePtr);
406 
407       if (ch == '\n')
408         {
409           done = TRUE;
410           ch = (Char) NLM_GETC (alignFilePtr);
411           if (ch != '\r') {
412             ungetc (ch, alignFilePtr);
413           
414           }
415         }
416       else if (ch == '\r') {
417         done = TRUE;
418       }
419       else
420         {
421           lineStr[totalLen] = ch;
422           segmentLen++;
423           totalLen++;
424         }
425 
426       /* Allocate more memory for the */
427       /* sequence if needed.          */
428 
429       if (segmentLen == configPtr->readBuffSize)
430         {
431           segmentCount++;
432           tempBuff = (CharPtr) MemNew(sizeof(Char) * 
433                                       segmentCount *
434                                       configPtr->readBuffSize);
435           if (tempBuff == NULL)
436             {
437               Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
438               MemFree (lineStr);
439               MemFree (tempBuff);
440               return NULL;
441             }
442           MemCpy(tempBuff, lineStr, (segmentCount-1) * configPtr->readBuffSize);
443           MemFree(lineStr);
444           lineStr = tempBuff;
445           segmentLen = 0;
446         }
447 
448     }
449 
450   /* Return successfully */
451 
452   if (EOF == ch)
453     *isEOF = TRUE;
454 
455   lineStr[totalLen] = '\0';
456 
457   return lineStr;
458 }
459 
460 /*=========================================================================*/
461 /*                                                                         */
462 /* s_ParseDefLine () -                                                     */
463 /*                                                                         */
464 /*=========================================================================*/
465 
466 #define DEFLINE_PRE_DATA      0
467 #define DEFLINE_DEFINITION    1
468 #define DEFLINE_SEQID         2
469 
470 static DefLineInfoPtr s_ParseDefLine (CharPtr lineStr,
471                                       Int4 rowNum,
472                                       ErrInfoPtr PNTR  errorListPtr)
473 {
474   Char           ch;
475   CharPtr        defStr;
476   CharPtr        idStr;
477   Int4           defPosition;
478   Int4           idPosition;
479   Int4           position;
480   Int2           state;
481   DefLineInfoPtr defLinePtr = NULL;
482   ErrInfoPtr     errPtr;
483 
484   defPosition = 0;
485   idPosition = 0;
486 
487   defStr = (CharPtr) MemNew (StringLen(lineStr)+1);
488   idStr  = (CharPtr) MemNew (StringLen(lineStr)+1);
489 
490   /* Parse the line character by character */
491 
492   state = DEFLINE_PRE_DATA;
493 
494   for (position = 0; lineStr[position] != '\0'; position++)
495     {
496       ch = lineStr[position];
497 
498       switch (state)
499         {
500         case DEFLINE_PRE_DATA :
501           if (IS_WHITESP(ch))
502             continue;
503           else if (ch == '>')
504             state = DEFLINE_SEQID;
505           else
506             {
507               MemFree(defStr);
508               MemFree(idStr);
509               return NULL;  /* Not a defline */
510             }
511           break;
512         case DEFLINE_SEQID : 
513           if (IsValidIdChar(ch))
514             {
515               idStr[idPosition] = ch;
516               idPosition++;
517             }
518           else if (IS_WHITESP(ch))      
519             {
520               if (idPosition > 0)
521                 {
522                   state = DEFLINE_DEFINITION;
523                   defStr[defPosition] = ch;
524                   defPosition++;
525                 }
526               else
527                 continue;
528             }
529           else if (ch == '[')
530             {
531               state = DEFLINE_DEFINITION;
532               defStr[defPosition] = ch;
533               defPosition++;
534             }
535           else
536             {
537               errPtr = Ali_AddError (errorListPtr, ERR_INVALID_DEFLINE,
538                                      lineStr, (Int4) ch);
539               errPtr->rowNum = rowNum;
540               MemFree(defStr);
541               MemFree(idStr);
542               return NULL;
543             }
544           break;
545         case DEFLINE_DEFINITION :
546           defStr[defPosition] = ch;
547           defPosition++;
548           break;
549         default:
550           break;
551         }
552     }
553 
554   /* Check for blank line */
555   
556   if (state == DEFLINE_PRE_DATA)
557     {
558       MemFree(defStr);
559       MemFree(idStr);
560       return NULL;
561     }
562 
563   idStr[idPosition]   = '\0';
564   defStr[defPosition] = '\0';
565 
566   /* Make sure that it has at least one */
567   /* set of square brackets.            */
568 
569   if ((StringChr(defStr,'[') == NULL) || (StringChr(defStr,']') == NULL))
570     {
571       errPtr = Ali_AddError (errorListPtr, ERR_DEFLINE_NODEFS, lineStr);
572       errPtr->rowNum = rowNum;
573     }
574   
575   /* If we made it to here, then */
576   /* it's a valid definition line. */
577 
578   defLinePtr = (DefLineInfoPtr) MemNew (sizeof (DefLineInfo));
579 
580   if (StringLen (defStr) != 0)
581     defLinePtr->definitions = defStr;
582   else
583     defLinePtr->definitions = NULL;
584 
585   if (StringLen (idStr) != 0)
586     defLinePtr->id = idStr;
587   else
588     defLinePtr->id = NULL;
589 
590   return defLinePtr;
591 }
592 
593 /*=========================================================================*/
594 /*                                                                         */
595 /* s_MightBeCorruptSequence ()                                             */
596 /*                                                                         */
597 /*=========================================================================*/
598 
599 static Boolean s_MightBeCorruptSequence (Int4             seqCharCount,
600                                          CharPtr          seqString,
601                                          AliConfigInfoPtr configPtr)
602 {
603   Int4    i;
604   Int4    badCharCount;
605   Int4    seqStrLen;
606   FloatLo percentGood;
607 
608   seqStrLen = StringLen(seqString);
609   badCharCount = 0;
610 
611   for (i = 0; i < seqStrLen; i++)
612     {
613       if (IsSequenceChar(seqString[i],
614                          configPtr->gapChar,
615                          configPtr->missingChar,
616                          configPtr->unalignedChar))
617         seqCharCount++;
618       else
619         badCharCount++;
620     }
621 
622   percentGood = (FloatLo) seqCharCount / ((FloatLo) seqCharCount + 
623                                          (FloatLo) badCharCount);
624 
625   if ((percentGood * 100) >= configPtr->corruptSeqThreshold)
626     return TRUE;
627   else
628     return FALSE;
629 }
630 
631 /*=========================================================================*/
632 /*                                                                         */
633 /* s_ParseSequenceLine () -                                                */
634 /*                                                                         */
635 /*=========================================================================*/
636 
637 #define PRE_DATA      0
638 #define FIRST_WORD    1
639 #define SEQUENCE_DATA 2
640 #define EOL_JUNK      3
641 #define POST_JUNK     4
642 
643 static SeqLineInfoPtr s_ParseSequenceLine (CharPtr lineStr,
644                                            AliConfigInfoPtr configPtr)
645 {
646   CharPtr        seqStr;
647   Int4           seqPosition = 0;
648   CharPtr        idStr;
649   Int4           idPosition = 0;
650   Int4           firstWordLen = 0;
651   Char           ch;
652   Int2           state = PRE_DATA;
653   Int4           position;
654   Boolean        firstWordNotSequence = FALSE;
655   Boolean        sequenceFound = FALSE;
656   CharPtr        tempStr;
657   Boolean        corruptSequence = FALSE;
658   SeqLineInfoPtr seqLinePtr;
659 
660   if (StringLen(lineStr) == 0)
661     return NULL;
662 
663   seqStr = (CharPtr) MemNew (StringLen(lineStr)+1);
664   idStr  = (CharPtr) MemNew (StringLen(lineStr)+1);
665 
666   for (position = 0; lineStr[position] != '\0'; position++)
667     {
668       ch = lineStr[position];
669 
670       switch (state)
671         {
672         case PRE_DATA :
673 
674           /* If it's the first non-whitespace char */
675           /* then we've found our first word.      */
676 
677           if (!IS_WHITESP(ch))
678             {
679               state = FIRST_WORD;
680               if (!IsSequenceChar(ch,
681                                   configPtr->gapChar,
682                                   configPtr->missingChar,
683                                   configPtr->unalignedChar))
684                 firstWordNotSequence = TRUE;
685               idStr[idPosition] = ch;
686               idPosition++;
687               firstWordLen++;
688             }
689           break;
690         case FIRST_WORD :
691           if (IS_WHITESP(ch))
692             {
693               state = SEQUENCE_DATA;
694               if ((idPosition > 0)   &&
695                   (firstWordNotSequence == FALSE))
696                 {
697                   tempStr = seqStr;
698                   seqStr  = idStr;
699                   idStr   = tempStr;
700                   seqPosition = idPosition;
701                   idPosition  = 0;
702                   sequenceFound = TRUE;
703                 }
704             }
705           else
706             {
707               /* If we find a non-sequence char in the */
708               /* first word then it might be an ID,    */
709               /* with the sequence following.          */
710               
711               if (!IsSequenceChar(ch,
712                                   configPtr->gapChar,
713                                   configPtr->missingChar,
714                                   configPtr->unalignedChar))
715                 firstWordNotSequence = TRUE;
716               idStr[idPosition] = ch;
717               idPosition++;
718               firstWordLen++;
719             }
720           break;
721         case SEQUENCE_DATA :
722           if (IS_WHITESP(ch))
723             continue;
724             
725           /* If we're in a sequence, then a non-sequence */
726           /* char invalidates it, although we do allow   */
727           /* 'junk' at the end.                          */
728           
729           if (!IsSequenceChar(ch,
730                               configPtr->gapChar,
731                               configPtr->missingChar,
732                               configPtr->unalignedChar))
733             {
734               if ((lineStr[position - 1] == ' ') && sequenceFound)
735                 state = EOL_JUNK;
736               else if ((corruptSequence == TRUE) ||
737                        (s_MightBeCorruptSequence (seqPosition,
738                                                   &(lineStr[position]),
739                                                   configPtr)))
740                 {
741                   seqStr[seqPosition] = ch;
742                   seqPosition++;
743                   sequenceFound = TRUE;
744                   corruptSequence = TRUE;
745                 }
746               else
747                 {
748                   MemFree(seqStr);
749                   MemFree(idStr);
750                   return NULL;
751                 }
752             }
753           else
754             {
755               seqStr[seqPosition] = ch;
756               seqPosition++;
757               sequenceFound = TRUE;
758             }
759           break;
760         case EOL_JUNK :
761           if (IS_WHITESP(ch))
762             state = POST_JUNK;
763           break;
764         case POST_JUNK :
765 
766           /* Only one 'word' of junk allowed */
767 
768           if (!IS_WHITESP(ch))
769             {
770               MemFree(seqStr);
771               MemFree(idStr);
772               return NULL;
773             }
774           break;
775         }
776     }
777 
778   /* Check for blank line */
779   
780   if (state == PRE_DATA)
781     {
782       MemFree(seqStr);
783       MemFree(idStr);
784       return NULL;
785     }
786   
787   if (state == FIRST_WORD)
788     {
789 
790       /* If there was just one word, and it isn't */
791       /* a sequence string, then this isn't a     */
792       /* sequence line.                           */
793       
794       if (firstWordNotSequence == TRUE)
795         {
796           MemFree(seqStr);
797           MemFree(idStr);
798           return NULL;
799         }
800       
801       /* If there was just one word, and it IS a sequence */
802       /* then the idStr is actually the seqStr.           */
803       
804       else
805         {
806           tempStr = seqStr;
807           seqStr  = idStr;
808           idStr   = tempStr;
809           seqPosition = idPosition;
810           idPosition  = 0;
811         }
812     }
813 
814   /* If still no sequence string, */
815   /* then not a sequence line.    */
816 
817   if (StringLen(seqStr) == 0)
818     {
819       MemFree(seqStr);
820       MemFree(idStr);
821       return NULL;
822     }
823 
824   /* Check to see if the ID is a valid one */
825 
826   idStr[idPosition]   = '\0';
827   seqStr[seqPosition] = '\0';
828 
829   if ((idPosition > 0) &&
830       (IsValidId (idStr) == FALSE) &&
831       (IsNumString (idStr) == FALSE))
832     {
833       MemFree(idStr);
834       MemFree(seqStr);
835       return NULL;
836     }
837 
838   /* If we made it to here, then */
839   /* it's a valid sequence line. */
840 
841   seqLinePtr = (SeqLineInfoPtr) MemNew (sizeof (SeqLineInfo));
842 
843   seqLinePtr->firstWordLen = firstWordLen;
844 
845   if (StringLen (seqStr) != 0)
846     seqLinePtr->sequence = seqStr;
847   else
848     {
849       seqLinePtr->sequence = NULL;
850       MemFree (seqStr);
851     }
852 
853   if (StringLen (idStr) != 0)
854     seqLinePtr->id = idStr;
855   else
856     {
857       seqLinePtr->id = NULL;
858       MemFree (idStr);
859     }
860 
861   if (corruptSequence)
862     seqLinePtr->maybe  = TRUE;
863   else
864     seqLinePtr->maybe  = FALSE;
865 
866   return seqLinePtr;
867 }
868 
869 /*=========================================================================*/
870 /*                                                                         */
871 /* s_ParseOtherLine () -                                                   */
872 /*                                                                         */
873 /*=========================================================================*/
874 
875 #define OTHER_PRE_DATA  0
876 #define OTHER_DATA      1
877 
878 static OtherLineInfoPtr s_ParseOtherLine (CharPtr lineStr)
879 {
880   Char             ch;
881   CharPtr          otherStr;
882   Int4             otherPosition;
883   Int4             position;
884   Int2             state;
885   Int4             wordCount;
886   OtherLineInfoPtr otherLinePtr;
887 
888   /* Parse the line character by character */
889 
890   otherStr = (CharPtr) MemNew (StringLen(lineStr)+1);
891   otherPosition = 0;
892   state     = OTHER_PRE_DATA;
893   wordCount = 0;
894 
895   for (position = 0; lineStr[position] != '\0'; position++)
896     {
897       ch = lineStr[position];
898 
899       switch (state)
900         {
901         case OTHER_PRE_DATA :
902           if (IS_WHITESP(ch))
903             continue;
904           else
905             {
906               wordCount = 1;
907               state = OTHER_DATA;
908               otherStr[otherPosition] = ch;
909               otherPosition++;
910             }
911           break;
912         case OTHER_DATA : 
913           if (IS_WHITESP(ch))   
914             wordCount++;
915           otherStr[otherPosition] = ch;
916           otherPosition++;
917           break;
918         default:
919           break;
920         }
921     }
922 
923   /* Check for blank line */
924   
925   if (state == OTHER_PRE_DATA)
926     {
927       MemFree(otherStr);
928       return NULL;
929     }
930   
931   /* If we made it to here, then */
932   /* it's a valid definition line. */
933 
934   otherStr[otherPosition]   = '\0';
935 
936   otherLinePtr = (OtherLineInfoPtr) MemNew (sizeof (OtherLineInfo));
937   if ((wordCount == 1) && IsValidId(otherStr))
938     {
939       otherLinePtr->id    = otherStr;
940       otherLinePtr->other = NULL;
941     }
942   else
943     {
944       otherLinePtr->id    = NULL;
945       otherLinePtr->other = otherStr;
946     }
947 
948   /* Return successfully */
949 
950   return otherLinePtr;
951 }
952 
953 /*=========================================================================*/
954 /*                                                                         */
955 /* s_OtherGetValue ()                                                      */
956 /*                                                                         */
957 /*=========================================================================*/
958 
959 CharPtr s_OtherGetValue (CharPtr otherStr)
960 {
961   CharPtr tempStrPtr;
962   CharPtr valueBuff;
963   Int2    charCount;
964 
965   /* Go to the first character after the '=' */
966 
967   if ((tempStrPtr = StringChr (otherStr, '=')) == NULL)
968     return NULL;
969   tempStrPtr++;
970 
971   /* Skip spaces */
972 
973   while (*tempStrPtr == ' ')
974     tempStrPtr++;
975 
976   if (*tempStrPtr == '\0')
977     return NULL;
978 
979   /* Get the value */
980 
981   valueBuff = (CharPtr) MemNew (ALI_MAX_LINE_LEN);
982   charCount = 0;
983 
984   while ((*tempStrPtr != ' ') && (*tempStrPtr != '\0'))
985     {
986       valueBuff[charCount] = *tempStrPtr;
987       charCount++;
988       tempStrPtr++;
989     }
990 
991   valueBuff[charCount] = '\0';
992 
993   /* Return successfully */
994 
995   return valueBuff;
996 }
997 
998 /*=========================================================================*/
999 /*                                                                         */
1000 /* s_ProcessOtherLine ()                                                   */
1001 /*                                                                         */
1002 /*=========================================================================*/
1003 
1004 static Boolean s_ProcessOtherLine (AliConfigInfoPtr configPtr,
1005                                    CharPtr          otherStr,
1006                                    AlignFileDataPtr fileInfoPtr)
1007 {
1008   CharPtr strPtr;
1009   CharPtr tmpStr;
1010 
1011   /* Check for datatype declaration */
1012 
1013   if (((strPtr = StringStr (otherStr, "datatype")) != NULL) ||
1014       ((strPtr = StringStr (otherStr, "DATATYPE")) != NULL) ||
1015       ((strPtr = StringStr (otherStr, "Datatype")) != NULL))
1016     {
1017       tmpStr = s_OtherGetValue(strPtr);
1018       if (StringICmp (tmpStr, "DNA") == 0)
1019         configPtr->declaredInfo.dataType = ALI_DATA_NUCLEOTIDE;
1020       else
1021         configPtr->declaredInfo.dataType = ALI_DATA_PROTEIN;
1022       MemFree (tmpStr);
1023     }
1024 
1025   /* Check for interleaved/contiguous */
1026 
1027   if (((strPtr = StringStr (otherStr, "INTERLEAVED")) != NULL) ||
1028       ((strPtr = StringStr (otherStr, "interleaved")) != NULL) ||
1029       ((strPtr = StringStr (otherStr, "Interleaved")) != NULL))
1030     configPtr->declaredInfo.contigOrInter = ALI_INTERLEAVED;
1031   else if (((strPtr = StringStr (otherStr, "CONTIGUOUS")) != NULL) ||
1032            ((strPtr = StringStr (otherStr, "contiguous")) != NULL) ||
1033            ((strPtr = StringStr (otherStr, "Contiguous")) != NULL))
1034     configPtr->declaredInfo.contigOrInter = ALI_CONTIGUOUS;
1035 
1036   /* Check for dimensions */
1037 
1038   if (((strPtr = StringStr (otherStr, "NTAX")) != NULL) ||
1039       ((strPtr = StringStr (otherStr, "ntax")) != NULL) ||
1040       ((strPtr = StringStr (otherStr, "nTax")) != NULL))
1041     {
1042       tmpStr = s_OtherGetValue (strPtr);
1043       configPtr->declaredInfo.idCount = atoi(tmpStr);
1044       MemFree (tmpStr);
1045     }
1046 
1047   if (((strPtr = StringStr (otherStr, "NCHAR")) != NULL) ||
1048       ((strPtr = StringStr (otherStr, "nchar")) != NULL) ||
1049       ((strPtr = StringStr (otherStr, "nChar")) != NULL))
1050     {
1051       tmpStr = s_OtherGetValue (strPtr);
1052       configPtr->declaredInfo.seqLength = atoi(tmpStr);
1053       MemFree (tmpStr);
1054     }
1055 
1056   /* Check for definition of missing character */
1057   
1058   if (((strPtr = StringStr (otherStr, "MISSING")) != NULL) ||
1059       ((strPtr = StringStr (otherStr, "missing")) != NULL) ||
1060       ((strPtr = StringStr (otherStr, "Missing")) != NULL))
1061     {
1062       tmpStr = s_OtherGetValue(strPtr);
1063       configPtr->missingChar = (CharPtr) MemNew (2);
1064       sprintf (configPtr->missingChar, "%c", tmpStr[0]);
1065       MemFree (tmpStr);
1066 
1067       /* If the new missing char conflicts with the */
1068       /* gap or unaligned char, then blank them out */
1069       /* to give the new one precedence.            */
1070 
1071       if (StringICmp (configPtr->missingChar, configPtr->gapChar) == 0)
1072         StringCpy (configPtr->gapChar, "");
1073 
1074       if (StringICmp (configPtr->missingChar, configPtr->unalignedChar) == 0)
1075         StringCpy (configPtr->unalignedChar, "");
1076         
1077     }
1078   
1079   /* Check for definition of gap character */
1080   
1081   if (((strPtr = StringStr (otherStr, "GAP")) != NULL) ||
1082       ((strPtr = StringStr (otherStr, "gap")) != NULL) ||
1083       ((strPtr = StringStr (otherStr, "Gap")) != NULL))
1084     {
1085       tmpStr = s_OtherGetValue(strPtr);
1086       configPtr->gapChar = (CharPtr) MemNew (2);
1087       sprintf (configPtr->gapChar, "%c", tmpStr[0]);
1088       MemFree (tmpStr);
1089 
1090       /* If the new gap char conflicts with the missing */
1091       /* or unaligned char, then blank them out to give */
1092       /* the new one precedence.                        */
1093 
1094       if (StringICmp (configPtr->gapChar, configPtr->missingChar) == 0)
1095         StringCpy (configPtr->missingChar, "");
1096 
1097       if (StringICmp (configPtr->gapChar, configPtr->unalignedChar) == 0)
1098         StringCpy (configPtr->unalignedChar, "");
1099         
1100     }
1101   
1102   /* Check for definition of unaligned character */
1103   
1104   if (((strPtr = StringStr (otherStr, "UNALIGNED")) != NULL) ||
1105       ((strPtr = StringStr (otherStr, "unaligned")) != NULL) ||
1106       ((strPtr = StringStr (otherStr, "Unaligned")) != NULL))
1107     {
1108       tmpStr = s_OtherGetValue(strPtr);
1109       configPtr->unalignedChar = (CharPtr) MemNew (2);
1110       sprintf (configPtr->unalignedChar, "%c", tmpStr[0]);
1111       MemFree (tmpStr);
1112 
1113       /* If the new unaligned char conflicts with the */
1114       /* gap or missing char, then blank them out to  */
1115       /* give the new one precedence.                 */
1116 
1117       if (StringICmp (configPtr->unalignedChar, configPtr->gapChar) == 0)
1118         StringCpy (configPtr->gapChar, "");
1119 
1120       if (StringICmp (configPtr->unalignedChar, configPtr->missingChar) == 0)
1121         StringCpy (configPtr->missingChar, "");
1122         
1123     }
1124   
1125   /* Return successfully */
1126   
1127   return TRUE;
1128 }
1129 
1130 /*=========================================================================*/
1131 /*                                                                         */
1132 /* Ali_ReadLines ()                                                        */
1133 /*                                                                         */
1134 /*=========================================================================*/
1135 
1136 ValNodePtr Ali_ReadLines (FILE PNTR        alignFilePtr,
1137                           ErrInfoPtr PNTR  errorListPtr,
1138                           AliConfigInfoPtr configPtr,
1139                           AlignFileDataPtr fileInfoPtr)
1140 {
1141   CharPtr          lineStr = NULL;
1142   ValNodePtr       rowList = NULL;
1143   ValNodePtr       newRow;
1144   SeqLineInfoPtr   seqLine;
1145   SeqLineInfoPtr   reEvalSeqPtr;
1146   DefLineInfoPtr   defLine;
1147   OtherLineInfoPtr otherLine;
1148   Boolean          nextRowMustBeSeq;
1149   Boolean          idFound;
1150   Boolean          lastRowWasOther = FALSE;
1151   Int4             rowNum;
1152   ErrInfoPtr       errPtr;
1153   Boolean          isEOF;
1154 
1155   nextRowMustBeSeq = FALSE;
1156   rowNum = 0;
1157   isEOF = FALSE;
1158 
1159   while (FALSE == isEOF)
1160     {
1161 
1162       /* Process the line according to its content ... */
1163 
1164       lineStr = ReadAlignFileLine(alignFilePtr, errorListPtr,
1165                                   configPtr, &isEOF);
1166       if (lineStr == NULL) {
1167         return NULL;
1168       }
1169 
1170       rowNum++;
1171 
1172       /* ... DefLine */
1173       
1174       if ((defLine = s_ParseDefLine(lineStr, rowNum, errorListPtr)) != NULL)
1175         {
1176           defLine->rowNum = rowNum;
1177           lastRowWasOther = FALSE;
1178           if (nextRowMustBeSeq)
1179             nextRowMustBeSeq = FALSE;
1180 
1181           /* If we found an ID, then the next */
1182           /* row must have a sequence.        */
1183 
1184           if ((defLine->id != NULL) && (StringLen(defLine->id) != 0))
1185             nextRowMustBeSeq = TRUE;
1186           else
1187             nextRowMustBeSeq = FALSE;
1188 
1189           /* Add a record for the defline */
1190 
1191           newRow = ValNodeAdd(&rowList);
1192           if (NULL == newRow)
1193             {
1194               errPtr = Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
1195               errPtr->rowNum = rowNum;
1196               return NULL;
1197             }
1198           
1199           newRow->choice = ALI_DEFLINE;
1200           newRow->data.ptrvalue = defLine;
1201 
1202         }
1203       
1204       /* ... Sequence Data */
1205 
1206       else if ((seqLine = s_ParseSequenceLine(lineStr, configPtr))
1207                != NULL)
1208         {
1209           seqLine->rowNum = rowNum;
1210 
1211           /* Is it a Nucleotide sequence or a Protein sequence? */
1212 
1213           seqLine->type = Ali_SeqLineGetType(seqLine->sequence, configPtr);
1214 
1215           /* Add a record for the sequence */
1216               
1217           newRow = ValNodeAdd(&rowList);
1218           if (NULL == newRow)
1219             {
1220               errPtr = Ali_AddError (errorListPtr, ERR_OUT_OF_MEMORY);
1221               errPtr->rowNum = rowNum;
1222               return NULL;
1223             }
1224           
1225           newRow->data.ptrvalue = seqLine;
1226 
1227           /* Mark it as a sequence line */
1228           
1229           if ((seqLine->maybe == FALSE) ||
1230               ((seqLine->maybe == TRUE) && (configPtr->useMaybes == TRUE)))
1231             {
1232               
1233               if (StringLen(seqLine->id) != 0)
1234                 lastRowWasOther = FALSE;
1235               
1236               newRow->choice = ALI_SEQLINE;
1237               
1238               if (nextRowMustBeSeq)
1239                 nextRowMustBeSeq = FALSE;
1240               
1241               /* A sequence must follow either a defline, */
1242               /* an ID, or another sequence.              */
1243               
1244               if (lastRowWasOther == TRUE)
1245                 {
1246                   reEvalSeqPtr = SeqLineReEval (seqLine);
1247                   if (NULL == reEvalSeqPtr)
1248                     Ali_ChangeRowToOther(newRow);
1249                   else
1250                     newRow->data.ptrvalue = reEvalSeqPtr;
1251                 }
1252             }
1253           else  /* A 'maybe' sequence that we're not using */
1254             {
1255               Ali_ChangeRowToOther(newRow);
1256               lastRowWasOther = TRUE;
1257             }
1258         }      
1259 
1260       /* ... Other */
1261       
1262       else
1263         {
1264           if (StringLen(lineStr) > 0)
1265             {
1266               if ((otherLine = s_ParseOtherLine(lineStr)) != NULL)
1267                 {
1268                   otherLine->rowNum = rowNum;
1269                   if (otherLine->id != NULL)
1270                     {
1271                       idFound = TRUE;
1272                       lastRowWasOther = FALSE;
1273                     }
1274                   else
1275                     {
1276                       idFound = FALSE;
1277                       lastRowWasOther = TRUE;
1278                     }
1279 
1280                   newRow = ValNodeAdd(&rowList);
1281                   if (NULL == newRow)
1282                     {
1283                       errPtr = Ali_AddError (errorListPtr,
1284                                              ERR_OUT_OF_MEMORY);
1285                       errPtr->rowNum = rowNum;
1286                       return NULL;
1287                     }
1288                   
1289                   newRow->choice = ALI_OTHERLINE;
1290                   newRow->data.ptrvalue = otherLine;
1291 
1292                   /* If the next row needs to be a Sequence, */
1293                   /* and we're not still on the same row,    */
1294                   /* then change the previous ID to other.   */
1295                   
1296                   if (nextRowMustBeSeq && !idFound)
1297                     nextRowMustBeSeq = FALSE;
1298                   
1299                   if (idFound)
1300                     nextRowMustBeSeq = TRUE;
1301                   else
1302                     nextRowMustBeSeq = FALSE;
1303 
1304                   /* Attempt to parse any configuration */
1305                   /* information from the line.         */
1306 
1307                   if (otherLine->other != NULL)
1308                     s_ProcessOtherLine (configPtr, otherLine->other, fileInfoPtr);
1309   
1310                 }
1311             }
1312         }
1313       MemFree (lineStr);
1314     }
1315 
1316   return rowList;
1317 }
1318 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.