NCBI C Toolkit Cross Reference

C/sequin/sequin2.c


  1 /*   sequin2.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  sequin2.c
 27 *
 28 * Author:  Jonathan Kans
 29 *
 30 * Version Creation Date:   1/22/95
 31 *
 32 * $Revision: 6.670 $
 33 *
 34 * File Description: 
 35 *
 36 * Modifications:  
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45 #include "sequin.h"
 46 #include <document.h>
 47 #include <sequtil.h>
 48 #include <biosrc.h>
 49 #include <cdrgn.h>
 50 #include <seqsub.h>
 51 #include <tofasta.h>
 52 #include <gather.h>
 53 #include <subutil.h>
 54 #include <suggslp.h>
 55 #include <toasn3.h>
 56 #include <toporg.h>
 57 #include <salfiles.h>
 58 #include <salsap.h>
 59 #include <salign.h>
 60 #include <edutil.h>
 61 #include <vsm.h>
 62 //#include <accentr.h>
 63 //#include <accutils.h>
 64 #include <pmfapi.h>
 65 #include <explore.h>
 66 #include <aliparse.h>
 67 #include <algo/blast/api/twoseq_api.h>
 68 #ifdef WIN_MOTIF
 69 #include <netscape.h>
 70 #endif
 71 #include <actutils.h>
 72 #include <salpanel.h>
 73 #include <findrepl.h>
 74 #include <macrodlg.h>
 75 #include <macroapi.h>
 76 
/* Enum association lists that are defined in another source file. */
extern EnumFieldAssoc  biosource_genome_simple_alist [];
extern EnumFieldAssoc  biosource_origin_alist [];

/* Display name / numeric value pairs for nucleotide molecule types.
 * NOTE(review): 253 and 254 look like private values used to tell
 * genomic DNA from genomic RNA - confirm against the code that reads them.
 */
static ENUM_ALIST(biomol_nucX_alist)
  {"Genomic DNA",            253},
  {"Genomic RNA",            254},
  {"Precursor RNA",            2},
  {"mRNA [cDNA]",              3},
  {"Ribosomal RNA",            4},
  {"Transfer RNA",             5},
  {"Small nuclear RNA",        6},
  {"Small cytoplasmic RNA",    7},
  {"Other-Genetic",            9},
  {"cRNA",                    11},
  {"Small nucleolar RNA",     12},
  {"Transcribed RNA",         13}, 
  {"Transfer-messenger RNA", MOLECULE_TYPE_TMRNA },
END_ENUM_ALIST

/* Reduced molecule type list that offers only the two genomic choices. */
static ENUM_ALIST(biomol_nucGen_alist)
  {"Genomic DNA",            253},
  {"Genomic RNA",            254},
END_ENUM_ALIST

/* Display name / value pairs for sequence topology. */
static ENUM_ALIST(topology_nuc_alist)
{"Linear",          TOPOLOGY_LINEAR},
{"Circular",        TOPOLOGY_CIRCULAR},
END_ENUM_ALIST

/* Display name / value pairs for the molecule class (Seq_mol_* values). */
static ENUM_ALIST(molecule_alist)
{"DNA",             Seq_mol_dna },
{"RNA",             Seq_mol_rna },
END_ENUM_ALIST

/* NOTE(review): presumably the buffer length reserved for a printed
 * integer - confirm at the places where it is used.
 */
#define PRINTED_INT_MAX_LEN 15

/* NOTE(review): these look like severity selectors for FASTA creation
 * messages (required vs. warning) - confirm at the places where they are used.
 */
#define CREATE_FASTA_REQUIRED 0
#define CREATE_FASTA_WARNING  1
115 
/* This structure holds a list of IDs and titles for a set of sequences.
 * It can be used to represent the new list of sequences being imported,
 * the existing list of sequences being imported, suggested changes for
 * a list of sequences, etc.
 *
 * The three arrays are parallel: entry i of each one describes the same
 * sequence, and each array has num_sequences entries.
 */
typedef struct idandtitleedit 
{
  CharPtr PNTR id_list;        /* ID string for each sequence */
  CharPtr PNTR title_list;     /* title string for each sequence */
  BoolPtr      is_seg;         /* per-sequence flag; set when the sequence was found inside a parts set */
  Int4         num_sequences;  /* number of entries in each of the arrays above */
  Boolean      nuc_only;       /* when TRUE, only nucleotide sequences are counted and looked up */
} IDAndTitleEditData, PNTR IDAndTitleEditPtr;
129 
130 /* These functions are for creating, copying, and freeing lists
131  * of titles and IDs.
132  */
133 static IDAndTitleEditPtr IDAndTitleEditNew (void)
134 {
135   IDAndTitleEditPtr iatep;
136   
137   iatep = (IDAndTitleEditPtr) MemNew (sizeof (IDAndTitleEditData));
138   if (iatep != NULL)
139   {
140     iatep->id_list = NULL;
141     iatep->title_list = NULL;
142     iatep->is_seg = NULL;
143     iatep->num_sequences = 0;
144     iatep->nuc_only = FALSE;
145   }
146   return iatep;
147 }
148 
149 static void IDAndTitleEditInit (IDAndTitleEditPtr iatep, Int4 new_num_sequences)
150 {
151   Int4 seq_num;
152   if (iatep == NULL)
153   {
154     return;
155   }
156   
157   /* free old lists, if any */
158   for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
159   {
160     iatep->id_list [seq_num] = MemFree (iatep->id_list [seq_num]);
161     iatep->title_list [seq_num] = MemFree (iatep->title_list [seq_num]);
162   }
163   iatep->id_list = MemFree (iatep->id_list);
164   iatep->title_list = MemFree (iatep->title_list);
165   iatep->is_seg = MemFree (iatep->is_seg);
166   
167   /* now create blanks for num_sequences entries */
168   iatep->num_sequences = MAX (0, new_num_sequences);
169   if (iatep->num_sequences > 0)
170   {
171     iatep->id_list = (CharPtr PNTR) MemNew (iatep->num_sequences * sizeof (CharPtr));
172     iatep->title_list = (CharPtr PNTR) MemNew (iatep->num_sequences * sizeof (CharPtr));
173     iatep->is_seg = (BoolPtr) MemNew (iatep->num_sequences * sizeof (Boolean));
174     for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
175     {
176       iatep->id_list [seq_num] = NULL;
177       iatep->title_list  [seq_num] = NULL;
178       iatep->is_seg [seq_num] = FALSE;
179     }
180   } 
181 }
182 
183 static IDAndTitleEditPtr IDAndTitleEditCopy (IDAndTitleEditPtr iatep_orig)
184 {
185   IDAndTitleEditPtr iatep_copy;
186   Int4              seq_num;
187   
188   if (iatep_orig == NULL)
189   {
190     return NULL;
191   }
192   
193   iatep_copy = IDAndTitleEditNew ();
194   if (iatep_copy == NULL)
195   {
196     return NULL;
197   }
198   
199   IDAndTitleEditInit (iatep_copy, iatep_orig->num_sequences);
200   for (seq_num = 0; seq_num < iatep_copy->num_sequences; seq_num++)
201   {
202     iatep_copy->id_list [seq_num] = StringSave (iatep_orig->id_list [seq_num]);
203     iatep_copy->title_list [seq_num] = StringSave (iatep_orig->title_list [seq_num]);
204     if (iatep_orig->is_seg != NULL)
205     {
206       iatep_copy->is_seg [seq_num] = iatep_orig->is_seg [seq_num];
207     }
208   }
209   
210   return iatep_copy;
211 }
212 
213 static IDAndTitleEditPtr IDAndTitleEditFree (IDAndTitleEditPtr iatep)
214 {
215   Int4 i;
216   
217   if (iatep != NULL)
218   {
219     for (i = 0; i < iatep->num_sequences; i++)
220     {
221       iatep->id_list [i] = MemFree (iatep->id_list [i]);
222       iatep->title_list [i] = MemFree (iatep->title_list [i]);
223     }
224     iatep->id_list = MemFree (iatep->id_list);
225     iatep->title_list = MemFree (iatep->title_list);
226     iatep->is_seg = MemFree (iatep->is_seg);
227     iatep = MemFree (iatep);
228   }
229   return iatep;
230 }
231 
232 /* These functions are for applying lists of titles and IDs
233  * to a SeqEntry list.
234  */
235 static Int4 CountSequencesAndSegments (SeqEntryPtr list, Boolean nuc_only)
236 {
237   Int4         num_seqs = 0;
238   BioseqSetPtr bssp;
239   BioseqPtr    bsp;
240   
241   while (list != NULL)
242   {
243     if (list->data.ptrvalue != NULL)
244     {
245       if (IS_Bioseq (list))
246       {
247         bsp = (BioseqPtr) list->data.ptrvalue;
248         if (!nuc_only || ISA_na (bsp->mol)) {
249           num_seqs ++;
250         }
251       }
252       else if (IS_Bioseq_set (list))
253       {
254         bssp = (BioseqSetPtr) list->data.ptrvalue;
255         num_seqs += CountSequencesAndSegments (bssp->seq_set, nuc_only);
256       }
257     }
258     list = list->next;
259   }
260   return num_seqs;
261 }
262 
263 static BioseqPtr FindNthSequenceInSet (SeqEntryPtr seq_list, Int4 nth, BoolPtr is_seg, Boolean nuc_only)
264 {
265   Int4         pos = 0;
266   BioseqPtr    bsp = NULL;
267   BioseqSetPtr bssp;
268   SeqEntryPtr  sep;
269   
270   while (seq_list != NULL && bsp == NULL)
271   {
272     if (seq_list->data.ptrvalue != NULL)
273     {
274       if (IS_Bioseq (seq_list) && seq_list->data.ptrvalue != NULL 
275           && (!nuc_only || ISA_na(((BioseqPtr)seq_list->data.ptrvalue)->mol)))
276       {
277         if (nth == pos)
278         {
279           bsp = seq_list->data.ptrvalue;
280         }
281         else
282         {
283           pos ++;
284         }
285       }
286       else if (IS_Bioseq_set (seq_list))
287       {
288         bssp = (BioseqSetPtr) seq_list->data.ptrvalue;
289         if (bssp->_class == BioseqseqSet_class_parts && is_seg != NULL)
290         {
291           *is_seg = TRUE;
292         }
293         sep = bssp->seq_set;
294         while (sep != NULL && bsp == NULL)
295         {
296           bsp = FindNthSequenceInSet (sep, nth - pos, is_seg, nuc_only);
297           if (bsp == NULL)
298           {
299             if (IS_Bioseq_set (sep))
300             {
301               bssp = (BioseqSetPtr) sep->data.ptrvalue;
302               pos += CountSequencesAndSegments (bssp->seq_set, nuc_only);
303             }
304             else if (IS_Bioseq (sep) && (!nuc_only || ISA_na (((BioseqPtr)(sep->data.ptrvalue))->mol)))
305             {
306               pos ++;
307             }
308           }
309           sep = sep->next;
310         }
311         if (bsp == NULL && is_seg != NULL)
312         {
313           *is_seg = FALSE;
314         }
315       }      
316     }
317     seq_list = seq_list->next;
318   }
319   return bsp;
320 }
321 
322 static IDAndTitleEditPtr SeqEntryListToIDAndTitleEditEx (SeqEntryPtr list, Boolean nuc_only)
323 {
324   IDAndTitleEditPtr iatep;
325   Int4              num_sequences, i;
326   BioseqPtr         bsp;
327   SeqDescrPtr       sdp;
328   SeqIdPtr          sip;
329   
330   num_sequences = CountSequencesAndSegments (list, nuc_only);
331   if (num_sequences == 0)
332   {
333     return NULL;
334   }
335   
336   iatep = IDAndTitleEditNew ();
337   if (iatep == NULL)
338   {
339     return NULL;
340   }
341 
342   iatep->nuc_only = nuc_only;
343   IDAndTitleEditInit (iatep, num_sequences);
344 
345   for (i = 0; i < num_sequences; i++)
346   {
347     bsp = FindNthSequenceInSet (list, i, &(iatep->is_seg [i]), nuc_only);
348     if (bsp != NULL)
349     {
350       sip = SeqIdFindBest (bsp->id, SEQID_GENBANK);
351       if (sip != NULL)
352       {
353         if (sip->choice == SEQID_LOCAL) {
354           iatep->id_list [i] = SeqIdWholeLabel (sip, PRINTID_REPORT);
355         } else {
356           iatep->id_list [i] = SeqIdWholeLabel (sip, PRINTID_FASTA_SHORT);
357         }
358       }
359       sdp = bsp->descr;
360       while (sdp != NULL && sdp->choice != Seq_descr_title)
361       {
362         sdp = sdp->next;
363       }
364       if (sdp != NULL && !StringHasNoText (sdp->data.ptrvalue))
365       {
366         iatep->title_list [i] = StringSave (sdp->data.ptrvalue);
367       }
368     }
369   }
370   return iatep;
371 }
/* Convenience form of SeqEntryListToIDAndTitleEditEx with nuc_only
 * switched off, so every Bioseq in the list is included.
 */
static IDAndTitleEditPtr SeqEntryListToIDAndTitleEdit (SeqEntryPtr list)
{
  return SeqEntryListToIDAndTitleEditEx (list, FALSE);
}
376 
377 static void ReplaceIDAndTitleForBioseq (BioseqPtr bsp, SeqIdPtr new_sip, CharPtr title)
378 {
379   SeqDescrPtr sdp;
380   SeqEntryPtr sep;
381   
382   if (bsp == NULL)
383   {
384     return;
385   }
386   
387   /* replace ID */
388   
389   if (new_sip != NULL)
390   {
391     if (bsp->id != NULL)
392     {
393       new_sip->next = bsp->id->next;
394       bsp->id->next = NULL;
395       bsp->id = SeqIdFree (bsp->id);
396     }
397     bsp->id = new_sip;
398     SeqMgrReplaceInBioseqIndex(bsp);
399   }
400   else
401   {
402     bsp->id = SeqIdFree (bsp->id);
403   }
404     
405   /* replace title */
406   if (title == NULL)
407   {
408     title = StringSave ("");
409   }
410   sdp = bsp->descr;
411   while (sdp != NULL && sdp->choice != Seq_descr_title)
412   {
413     sdp = sdp->next;
414   }
415   if (sdp == NULL)
416   {
417     sep = SeqMgrGetSeqEntryForData (bsp);
418     sdp = CreateNewDescriptor (sep, Seq_descr_title);
419     sdp->data.ptrvalue = title;
420   }
421   else
422   {
423     sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
424     sdp->data.ptrvalue = title;
425   }  
426 }
427 
428 static void ResetSegSetIDLists (SeqEntryPtr list)
429 {
430   BioseqSetPtr bssp, parts;
431   BioseqPtr    seg_bsp;
432   SeqEntryPtr  sep;
433   SeqLocPtr    loc, next_loc, last_loc;
434   
435   if (list == NULL)
436   {
437     return;
438   }
439   
440   if (list->data.ptrvalue != NULL)
441   {
442     if (IS_Bioseq_set (list))
443     {
444       bssp = (BioseqSetPtr) list->data.ptrvalue;
445       if (bssp->_class == BioseqseqSet_class_segset)
446       {
447         sep = bssp->seq_set;
448         seg_bsp = NULL;
449         parts = NULL;
450         while (sep != NULL && (seg_bsp == NULL || parts == NULL))
451         {
452           if (IS_Bioseq (sep))
453           {
454             seg_bsp = sep->data.ptrvalue;
455           }
456           else if (IS_Bioseq_set (sep))
457           {
458             parts = sep->data.ptrvalue;
459             if (parts != NULL && parts->_class != BioseqseqSet_class_parts)
460             {
461               parts = NULL;
462             }
463           }
464           sep = sep->next;
465         }
466         if (seg_bsp != NULL)
467         {
468           /* remove old location */
469           loc = (SeqLocPtr) seg_bsp->seq_ext;
470           while (loc != NULL)
471           {
472             next_loc = loc->next;
473             loc->next = NULL;
474             loc = SeqLocFree (loc);
475             loc = next_loc;
476           }
477           seg_bsp->seq_ext = NULL;
478           /* put in new locations */
479           sep = parts->seq_set;
480           last_loc = NULL;
481           while (sep != NULL)
482           {
483             if (IS_Bioseq (sep) && sep->data.ptrvalue != NULL)
484             {
485               loc = SeqLocWholeNew (sep->data.ptrvalue);
486               if (loc != NULL)
487               {
488                 if (last_loc == NULL)
489                 {
490                   seg_bsp->seq_ext = loc;
491                 }
492                 else
493                 {
494                   last_loc->next = loc;
495                 }
496                 last_loc = loc;
497               }
498             }
499             sep = sep->next;
500           }
501         }
502       }
503       else
504       {
505         ResetSegSetIDLists (bssp->seq_set);
506       }
507     }
508   }
509   ResetSegSetIDLists (list->next);
510 }
511 
512 
513 static Boolean ApplyIDAndTitleEditToSeqEntryList (SeqEntryPtr list, IDAndTitleEditPtr iatep)
514 {
515   Int4      i;
516   SeqIdPtr  new_sip;
517   BioseqPtr bsp;
518   
519   if (list == NULL || iatep == NULL)
520   {
521     return FALSE;
522   }
523   
524   if (CountSequencesAndSegments (list, iatep->nuc_only) != iatep->num_sequences)
525   {
526     return FALSE;
527   }
528   
529   for (i = 0; i < iatep->num_sequences; i++)
530   {
531     bsp = FindNthSequenceInSet (list, i, NULL, iatep->nuc_only);
532     if (bsp != NULL)
533     {
534       new_sip = NULL;
535       if (StringChr (iatep->id_list[i], '|') != NULL) {
536         new_sip = SeqIdParse (iatep->id_list[i]);
537       }
538       if (new_sip == NULL) {
539         new_sip = MakeSeqID (iatep->id_list [i]);
540       }
541       ReplaceIDAndTitleForBioseq (bsp, new_sip, StringSave (iatep->title_list [i]));
542     }
543   }
544   ResetSegSetIDLists (list);
545   return TRUE;
546 }
547 
/* this section of code is used to read and parse the taxlist.txt 
 * and lineages.txt files */

/* List of known organisms; each node's data.ptrvalue is an OrgInfoPtr.
 * Filled by LoadOrganismList the first time it is called.
 */
static ValNodePtr orglist = NULL;

/* One organism record.  All fields except lineage come from a line of
 * taxlist.txt; lineage is added afterwards from lineages.txt.
 */
typedef struct orginfo 
{
  CharPtr taxname;   /* taxonomy name */
  CharPtr common;    /* common name */
  Int4    ngcode;    /* nuclear genetic code */
  Int4    mgcode;    /* mitochondrial genetic code */
  CharPtr div;       /* division */
  Int4    taxnum;    /* taxonomy number; key used to attach the lineage */
  CharPtr lineage;   /* lineage text, NULL when none was found */
} OrgInfoData, PNTR OrgInfoPtr;
562 
563 static FILE *OpenSequinDataFile (CharPtr filename)
564 {
565   Char              str [PATH_MAX];
566   CharPtr           ptr;
567   FILE              *f = NULL;
568 
569   if (StringHasNoText (filename))
570   {
571     return NULL;
572   }
573 
574   ProgramPath (str, sizeof (str));
575   ptr = StringRChr (str, DIRDELIMCHR);
576   if (ptr == NULL)
577   {
578     return NULL;
579   }
580   
581   *ptr = '\0';
582   FileBuildPath (str, NULL, filename);
583   f = FileOpen (str, "r");
584   if (f == NULL) {
585     if (GetAppParam ("NCBI", "NCBI", "DATA", "", str, sizeof (str))) {
586       FileBuildPath (str, NULL, filename);
587       f = FileOpen (str, "r");
588     }
589   }
590   return f;
591 }
592 
593 static OrgInfoPtr FindByTaxNum (Int4 taxnum)
594 {
595   ValNodePtr vnp;
596   OrgInfoPtr oip;
597   
598   for (vnp = orglist; vnp != NULL; vnp = vnp->next)
599   {
600     oip = (OrgInfoPtr) vnp->data.ptrvalue;
601     if (oip != NULL && oip->taxnum == taxnum)
602     {
603       return oip;
604     }
605   }
606   return NULL;
607 }
608 
609 static OrgInfoPtr FindByTaxName (CharPtr taxname)
610 {
611   ValNodePtr vnp;
612   OrgInfoPtr oip;
613   
614   if (StringHasNoText (taxname))
615   {
616     return NULL;
617   }
618   
619   for (vnp = orglist; vnp != NULL; vnp = vnp->next)
620   {
621     oip = (OrgInfoPtr) vnp->data.ptrvalue;
622     if (oip != NULL && StringICmp (oip->taxname, taxname) == 0)
623     {
624       return oip;
625     }
626   }
627   return NULL;
628 }
629 
630 static void AddLineagesToOrganismList (void)
631 {
632   ReadBufferData    rbd;
633   CharPtr           line;
634   CharPtr           ptr;
635   FILE              *f;
636   OrgInfoPtr        oip;
637   Int4              taxnum;
638 
639   /* can only add lineages to existing list */
640   if (orglist == NULL) return;
641 
642   /* now read in lineages */
643   f = OpenSequinDataFile ("lineages.txt");
644 
645   if (f != NULL) 
646   {
647     rbd.fp = f;
648     rbd.current_data = NULL;
649     line = AbstractReadFunction (&rbd);
650     line = AbstractReadFunction (&rbd);
651     while (line != NULL)
652     {
653       ptr = StringChr (line, '\t');
654       if (ptr != NULL) 
655       {
656         *ptr = '\0';
657         if (StrToLong (line, &taxnum)) 
658         {
659           oip = FindByTaxNum (taxnum);
660           if (oip != NULL)
661           {
662             oip->lineage = StringSave (ptr + 1);
663           }
664         }
665       }
666         line = AbstractReadFunction (&rbd);
667     }
668     FileClose (f);
669   }
670 }
671 
672 static CharPtr GetNextToken (CharPtr PNTR pstart)
673 {
674   CharPtr pend;
675   CharPtr newval = NULL;
676   
677   if (pstart == NULL || *pstart == NULL)
678   {
679     return NULL;
680   }
681   
682   pend = StringChr (*pstart, '\t');
683   if (pend != NULL)
684   {
685     *pend = 0;
686   }
687   newval = StringSave (*pstart);
688   if (pend == NULL)
689   {
690     *pstart = NULL;
691   }
692   else
693   {
694     *pstart = pend + 1;
695   }
696   return newval;
697 }
698 
699 static void LoadOrganismList (void)
700 {
701   ReadBufferData    rbd;
702   CharPtr           line;
703   CharPtr           p_start, numval;
704   FILE              *f;
705   OrgInfoPtr        oip;
706 
707   if (orglist != NULL) return;
708   
709   f = OpenSequinDataFile ("taxlist.txt");
710 
711   if (f != NULL) {
712     rbd.fp = f;
713     rbd.current_data = NULL;
714     line = AbstractReadFunction (&rbd);
715     line = AbstractReadFunction (&rbd);
716     while (line != NULL)
717     {
718       oip = (OrgInfoPtr) MemNew (sizeof (OrgInfoData));
719       if (oip != NULL)
720       {
721         p_start = line;
722         /* read in tax name */
723         oip->taxname = GetNextToken (&p_start);
724         
725         /* read in common name */
726         oip->common = GetNextToken (&p_start);
727          
728         /* read in nuclear genetic code */
729         numval = GetNextToken (&p_start);
730         if (numval != NULL)
731         {
732           StrToLong (numval, &(oip->ngcode));
733           numval = MemFree (numval);
734         }
735         /* read in mitochondrial genetic code */
736         numval = GetNextToken (&p_start);
737         if (numval != NULL)
738         {
739           StrToLong (numval, &(oip->mgcode));
740           numval = MemFree (numval);
741         }
742         
743         /* read in div */
744         oip->div = GetNextToken (&p_start);
745         
746         /* read in taxnum */
747         numval = GetNextToken (&p_start);
748         if (numval != NULL)
749         {
750           StrToLong (numval, &(oip->taxnum));
751           numval = MemFree (numval);
752         }
753                 
754         ValNodeAddPointer (&orglist, 0, oip);
755       }
756       line = MemFree (line);
757         line = AbstractReadFunction (&rbd);
758     }
759     FileClose (f);
760   }
761   AddLineagesToOrganismList ();
762 }
763 
764 /* This section of code is used for determining genetic codes based on
765  * FASTA-defline values.
766  */
767 #define USE_NUCLEAR_GENETIC_CODE       1
768 #define USE_MITOCHONDRIAL_GENETIC_CODE 2
769 #define USE_OTHER_GENETIC_CODE         3
770 
771 static Int4 UseGeneticCodeForLocation (CharPtr location)
772 {
773   if (StringHasNoText (location))
774   {
775     return USE_NUCLEAR_GENETIC_CODE;
776   }
777   else if (StringICmp (location, "Mitochondrion") == 0
778            || StringICmp (location, "Kinetoplast") == 0
779            || StringICmp (location, "Hydrogenosome") == 0)
780   {
781     return USE_MITOCHONDRIAL_GENETIC_CODE;
782   }
783   else if (StringICmp (location, "Chloroplast") == 0
784            || StringICmp (location, "Chromoplast") == 0
785            || StringICmp (location, "plastid") == 0
786            || StringICmp (location, "cyanelle") == 0
787            || StringICmp (location, "apicoplast") == 0
788            || StringICmp (location, "leucoplast") == 0
789            || StringICmp (location, "proplastid") == 0)
790   {
791     return USE_OTHER_GENETIC_CODE;
792   }
793   else
794   {
795     return USE_NUCLEAR_GENETIC_CODE;
796   }
797 }
798 
799 
800 static Int4 GetGeneticCodeForTaxNameAndLocation (CharPtr taxname, CharPtr location)
801 {
802   ValNodePtr vnp;
803   OrgInfoPtr oip;
804   Int4       use_code;
805   
806   use_code = UseGeneticCodeForLocation (location);
807   if (use_code == USE_OTHER_GENETIC_CODE)
808   {
809     return 11;
810   }
811   else if (StringHasNoText (taxname))
812   {
813     return -1;
814   }
815   
816   for (vnp = orglist; vnp != NULL; vnp = vnp->next)
817   {
818     if (vnp->data.ptrvalue == NULL)
819     {
820       continue;
821     }
822     oip = (OrgInfoPtr) vnp->data.ptrvalue;
823     if (StringICmp (oip->taxname, taxname) == 0)
824     {
825       if (use_code == USE_NUCLEAR_GENETIC_CODE)
826       {
827         return oip->ngcode;
828       }
829       else
830       {
831         return oip->mgcode;
832       }
833     }
834   }
835   
836   return -1;
837 }
838 
839 static CharPtr GeneticCodeStringFromIntAndList (Int4 num, ValNodePtr list)
840 {
841   while (list != NULL)
842   {
843     if (list->choice == num)
844     {
845       return list->data.ptrvalue;
846     }
847     list = list->next;
848   }
849   return NULL;
850 }
851 
852 
853 /* these functions deal with commonly asked questions about package types - 
854  * which ones are sets, which ones are single sequences, which ones have
855  * which default molecule types.
856  */
857 static Boolean PackageTypeIsSet (Int2 seqPackage)
858 {
859   if (seqPackage == SEQ_PKG_POPULATION
860       || seqPackage == SEQ_PKG_PHYLOGENETIC 
861       || seqPackage == SEQ_PKG_MUTATION
862       || seqPackage == SEQ_PKG_ENVIRONMENT
863       || seqPackage == SEQ_PKG_GENBANK)
864   {
865     return TRUE;
866   }
867   else
868   {
869     return FALSE;
870   }
871   
872 }
873 
874 static Boolean PackageTypeIsSingle (Int2 seqPackage)
875 {
876   if (seqPackage == SEQ_PKG_SINGLE
877       || seqPackage == SEQ_PKG_SEGMENTED
878       || seqPackage == SEQ_PKG_GAPPED)
879   {
880     return TRUE;
881   }
882   else
883   {
884     return FALSE;
885   }
886 }
887 
888 /* These functions are used to find titles in SeqEntries */
889 static void FindFirstTitle (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
890 
891 {
892   CharPtr PNTR  ttlptr;
893 
894   if (mydata == NULL) return;
895   ttlptr = (CharPtr PNTR) mydata;
896   if (*ttlptr != NULL) return;
897   *ttlptr = SeqEntryGetTitle (sep);
898 }
899 
900 static void FindFirstSeqEntryTitle (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
901 
902 {
903   SeqEntryPtr PNTR  sepptr;
904 
905   if (mydata == NULL) return;
906   sepptr = (SeqEntryPtr PNTR) mydata;
907   if (*sepptr != NULL) return;
908   if (SeqEntryGetSeqDescr (sep, Seq_descr_title, NULL) != NULL) {
909    *sepptr = sep;
910   }
911 }
912 
913 /* These functions are used to change the values of modifiers in definition lines */
914 
915 extern void MakeSearchStringFromAlist (CharPtr str, CharPtr name)
916 
917 {
918   Char     ch;
919   CharPtr  ptr;
920 
921   StringCpy (str, "[");
922   StringCat (str, name);
923   StringCat (str, "=");
924   ptr = str;
925   ch = *ptr;
926   while (*ptr != '\0') {
927     *ptr = TO_LOWER (ch);
928     ptr++;
929     ch = *ptr;
930   }
931 }
932 
933 /* This section of code is used for parsing well-formatted definition lines.
934  */
/* Maps an alternative spelling of a modifier name to the name used
 * internally.
 */
typedef struct modifieralias 
{
  CharPtr alias;      /* spelling that may appear in a definition line */
  CharPtr modifier;   /* name it is translated to */
} ModifierAlias, PNTR ModifierAliasPtr;

/* Known aliases; consulted last by GetCanonicalName.
 * NOTE(review): "note" and "comment" are also in protein_modifier_names,
 * which GetCanonicalName checks first, so these two aliases may never be
 * reached - confirm whether that is intended.
 */
static ModifierAlias alias_list [] =
{
  { "org", "organism" },
  { "mol-type", "moltype" },
  { "mol_type", "moltype" },
  { "note", "note-orgmod" },
  { "comment", "note-orgmod" },
  { "common-name", "common name"},
  { "subsource", "note-subsrc" },
  { "technique", "tech" },
  { "prot", "protein" },
  { "prot_desc", "protein_desc" }
};

/* number of entries in alias_list */
static Int4 num_aliases = sizeof (alias_list) / sizeof (ModifierAlias);
956 
/* Modifier names that GetModifierType reports as eModifierType_Protein.
 * GetCanonicalName checks this table before any other source of names.
 */
static CharPtr protein_modifier_names [] = 
{
  "gene",
  "gene_syn",
  "protein",
  "protein_desc",
  "note",
  "comment",
  "orf",
  "function",
  "EC_number"
};

/* number of entries in protein_modifier_names */
static Int4 num_protein_modifier_names = sizeof (protein_modifier_names) / sizeof (CharPtr);
971 
972 static CharPtr GetCanonicalName (CharPtr mod_name)
973 {
974   Int4 j;
975   Uint1   subtype;
976   
977   if (StringHasNoText (mod_name))
978   {
979     return StringSave ("");
980   }
981   
982   for (j = 0; j < num_protein_modifier_names; j++) {
983     if (StringsAreEquivalent (mod_name, protein_modifier_names[j])) {
984       return StringSave (protein_modifier_names[j]);
985     }
986   }
987 
988   subtype = EquivalentOrgMod (mod_name);
989   if (subtype != 0) {
990     return StringSave (GetOrgModQualName (subtype));
991   }
992 
993   subtype = EquivalentSubSource (mod_name);
994   if (subtype != 0) {
995     return StringSave (GetSubsourceQualName (subtype));
996   }
997  
998   for (j = 0; j < num_aliases; j++)
999   {
1000     if (StringsAreEquivalent (alias_list [j].alias, mod_name))
1001     {
1002       return StringSave (alias_list [j].modifier);
1003     }
1004   }
1005   return StringSave (mod_name);
1006 }
1007 
/* Kinds of definition-line modifier, as reported by GetModifierType.
 * eModifierType_SourceQual is the value used for every name that is not
 * recognized as one of the other kinds.
 */
typedef enum {
    eModifierType_SourceQual = 0,
    eModifierType_Organism,
    eModifierType_Location,
    eModifierType_Lineage,
    eModifierType_GeneticCode,
    eModifierType_GeneticCodeComment,
    eModifierType_NucGeneticCode,
    eModifierType_MitoGeneticCode,
    eModifierType_MolType,
    eModifierType_Molecule,
    eModifierType_Origin,
    eModifierType_Topology,
    eModifierType_CommonName,
    eModifierType_Technique,
    eModifierType_Protein
} EModifierType;
1025 
/* One modifier.  name and value are owned by the structure and are
 * released by ModifierInfoFree.
 */
typedef struct modifierinfo 
{
  CharPtr       name;      /* modifier name */
  Uint1         subtype;   /* NOTE(review): presumably the OrgMod/SubSource subtype for source qualifiers - confirm */
  CharPtr       value;     /* modifier value */
  EModifierType modtype;   /* kind of modifier */
} ModifierInfoData, PNTR ModifierInfoPtr;
1033 
1034 static ModifierInfoPtr ModifierInfoNew (void)
1035 {
1036   ModifierInfoPtr mip;
1037   mip = (ModifierInfoPtr) MemNew (sizeof (ModifierInfoData));
1038   if (mip == NULL) return NULL;
1039   mip->name = NULL;
1040   mip->value = NULL;
1041   mip->modtype = eModifierType_SourceQual;
1042   return mip;
1043 }
1044 
1045 static ModifierInfoPtr ModifierInfoFree (ModifierInfoPtr mip)
1046 {
1047   if (mip == NULL) return NULL;
1048   mip->name = MemFree (mip->name);
1049   mip->value = MemFree (mip->value);
1050   mip = MemFree (mip);
1051   return mip;
1052 }
1053 
1054 static ValNodePtr ModifierInfoListFree (ValNodePtr list)
1055 {
1056   if (list == NULL) return NULL;
1057   ModifierInfoListFree (list->next);
1058   list->next = NULL;
1059   list->data.ptrvalue = ModifierInfoFree (list->data.ptrvalue);
1060   ValNodeFree (list);
1061   return NULL;
1062 }
1063 
1064 static EModifierType GetModifierType (CharPtr mod_name)
1065 {
1066   Int4 i;
1067   CharPtr canonical_name;
1068   EModifierType returntype;
1069   
1070   canonical_name = GetCanonicalName (mod_name);
1071   
1072   if (StringHasNoText (canonical_name))
1073   {
1074     returntype = eModifierType_SourceQual;
1075   }
1076   else if (StringICmp (canonical_name, "organism") == 0
1077            || StringICmp (canonical_name, "org") == 0)
1078   {
1079         returntype = eModifierType_Organism;
1080   }
1081   else if (StringICmp (canonical_name, "location") == 0)
1082   {
1083     returntype = eModifierType_Location;
1084   }
1085   else if (StringICmp (canonical_name, "lineage") == 0)
1086   {
1087     returntype = eModifierType_Lineage;
1088   }
1089   else if (StringICmp (canonical_name, "gcode") == 0)
1090   {
1091     returntype = eModifierType_NucGeneticCode;
1092   }
1093   else if (StringICmp (canonical_name, "mgcode") == 0)
1094   {
1095     returntype = eModifierType_MitoGeneticCode;
1096   }
1097   else if (StringICmp (canonical_name, "genetic_code") == 0)
1098   {
1099     returntype = eModifierType_GeneticCode;
1100   }
1101   else if (StringICmp (canonical_name, "gencode_comment") == 0)
1102   {
1103     returntype = eModifierType_GeneticCodeComment;
1104   }
1105   else if (StringICmp (canonical_name, "moltype") == 0)
1106   {
1107     returntype = eModifierType_MolType;
1108   }
1109   else if (StringICmp (canonical_name, "molecule") == 0)
1110   {
1111     returntype = eModifierType_Molecule;
1112   }
1113   else if (StringICmp (canonical_name, "origin") == 0)
1114   {
1115     returntype = eModifierType_Origin;
1116   }
1117   else if (StringICmp (canonical_name, "topology") == 0)
1118   {
1119     returntype = eModifierType_Topology;
1120   }
1121   else if (StringICmp (canonical_name, "common name") == 0)
1122   {
1123     returntype = eModifierType_CommonName;
1124   }
1125   else if (StringICmp (canonical_name, "tech") == 0)
1126   {
1127     returntype = eModifierType_Technique;
1128   }
1129   else
1130   {
1131     for (i = 0; i < num_protein_modifier_names; i++)
1132     {
1133       if (StringICmp (canonical_name, protein_modifier_names[i]) == 0)
1134       {
1135         returntype = eModifierType_Protein;
1136         canonical_name = MemFree (canonical_name);
1137         return returntype;
1138       }
1139     }
1140     returntype = eModifierType_SourceQual;   
1141   }
1142   
1143   canonical_name = MemFree (canonical_name);
1144   return returntype;
1145 }
1146 
1147 static Boolean AllowMultipleValues (CharPtr mod_name)
1148 {
1149   EModifierType mod_type;
1150   Boolean       rval = FALSE;
1151   
1152   mod_type = GetModifierType (mod_name);
1153   switch (mod_type)
1154   {
1155     case eModifierType_SourceQual:
1156       if (! IsNonTextModifier (mod_name))
1157       {
1158         rval = TRUE;
1159       }
1160       break;
1161     case eModifierType_CommonName:
1162       rval = TRUE;
1163       break;
1164     case eModifierType_Organism:
1165       rval = TRUE;
1166       break;
1167     default:
1168       rval = FALSE;
1169       break;
1170   }
1171   return rval;
1172 }
1173 
/* Result codes produced by DetectBadBracketing when it checks the
 * [name=value] bracketing of a string.
 */
typedef enum
{
  BRACKET_ERR_NO_ERR              = 0, /* bracketing is acceptable */
  BRACKET_ERR_MISMATCHED_BRACKETS = 1, /* '[' / ']' out of sequence or unclosed */
  BRACKET_ERR_MISSING_EQUALS      = 2, /* name present but '=' was not found */
  BRACKET_ERR_MULT_EQUALS         = 3, /* second '=' seen where ']' was expected */
  BRACKET_ERR_NO_MOD_NAME         = 4, /* only blanks between '[' and '=' */
  BRACKET_ERR_MISMATCHED_QUOTES   = 5  /* opening quote without a closing quote */
} bracketing_err_num;
1183 
1184 static Char ExpectToken (CharPtr cp)
1185 {
1186   CharPtr valstart;
1187   
1188   if (cp == NULL)
1189   {
1190     return 0;
1191   }
1192   else if (*cp == '[')
1193   {
1194     valstart = cp + 1 + StringSpn (cp + 1, " \t");
1195     if (StringLen (valstart) > 3 
1196         && (StringNICmp (valstart, "dna", 3) == 0 
1197             || StringNICmp (valstart, "rna", 3) == 0
1198             || StringNICmp (valstart, "orf", 3) == 0)
1199         && *(valstart + 3 + StringSpn (valstart + 3, " \t")) == ']')
1200     {
1201       return ']';    
1202     }
1203     else
1204     {
1205       return '=';
1206     }
1207   }
1208   else if (*cp == '=')
1209   {
1210     return ']';
1211   }
1212   else if (*cp == ']')
1213   {
1214     return '[';
1215   }
1216   else
1217   {
1218     return 0;
1219   }
1220 }
1221 
1222 /* When we are looking for double-quotation marks to use for delimiting
1223  * sections of a title that should not be parsed or values that may contain
1224  * brackets, equals signs, or other reserved characters, skip over
1225  * quotation marks that are preceded by the escape character (backslash).
1226  * This allows quotation marks to be included in a quoted string.
1227  */
1228 static CharPtr NextUnescapedQuote (CharPtr str)
1229 {
1230   CharPtr cp;
1231   
1232   if (StringHasNoText (str))
1233   {
1234     return NULL;
1235   }
1236   cp = StringChr (str, '"');
1237   if (cp != NULL && cp != str)
1238   {
1239     while (cp != NULL && *(cp - 1) == '\\')
1240     {
1241       cp = StringChr (cp + 1, '"');
1242     }
1243   }
1244   return cp;  
1245 }
1246 
1247 /* This function steps backward from str_end until it has located 
1248  * an unescaped double-quotation mark or it has reached the 
1249  * start of the string (str_start).
1250  */
1251 static CharPtr FindPreviousUnescapedQuote (CharPtr str_start, CharPtr str_end)
1252 {
1253   CharPtr cp;
1254   if (str_start == NULL || str_end == NULL || str_end < str_start)
1255   {
1256     return NULL;
1257   }
1258   
1259   cp = str_end;
1260   while (cp > str_start && (*cp != '"' || *(cp - 1) == '\\'))
1261   {
1262     cp--;
1263   }
1264   if (*cp != '"')
1265   {
1266     cp = NULL;
1267   }
1268   return cp;
1269 }
1270 
1271 
1272 /* This function finds the next bracketing token ([, =, or ]) in
1273  * the string that is not enclosed by unescaped quotation marks.
1274  */
1275 static CharPtr NextBracketToken (CharPtr str)
1276 {
1277   CharPtr next_quote;
1278   CharPtr cp;
1279   
1280   if (StringHasNoText (str))
1281   {
1282     return NULL;
1283   }
1284   
1285   cp = str;
1286   while (*cp != 0)
1287   {
1288     switch (*cp)
1289     {
1290       case '"':
1291         if (cp == str || (*(cp - 1) != '\\'))
1292         {
1293           next_quote = NextUnescapedQuote (cp + 1);
1294           if (next_quote == NULL)
1295           {
1296             return cp;
1297           }
1298           else
1299           {
1300             cp = next_quote + 1;;
1301           }
1302         }
1303         else
1304         {
1305           cp++;
1306         }
1307         break;
1308       case '[':
1309       case ']':
1310       case '=':
1311         return cp;
1312       default:
1313         cp++;
1314     }
1315   }
1316     
1317   return NULL;
1318 }
1319 
1320 static Int4 DetectBadBracketing (CharPtr str)
1321 {
1322   CharPtr cp;
1323   Char    expected_token;
1324   CharPtr last_token = NULL, namestart;
1325   
1326   if (StringHasNoText (str))
1327   {
1328     return BRACKET_ERR_NO_ERR;
1329   }
1330   
1331   expected_token = '[';
1332   cp = NextBracketToken (str);
1333   while (cp != NULL)
1334   {
1335     switch (*cp)
1336     {
1337       case '"':
1338         return BRACKET_ERR_MISMATCHED_QUOTES;
1339         break;
1340       case '[':
1341       case ']':
1342       case '=':
1343         if (expected_token == *cp)
1344         {
1345           if (expected_token == '=' && last_token != NULL)
1346           {
1347             namestart = last_token + 1 + StringSpn (last_token + 1, " \t");
1348             if (namestart == cp)
1349             {
1350               return BRACKET_ERR_NO_MOD_NAME;
1351             }
1352           }
1353           expected_token = ExpectToken (cp);
1354           last_token = cp;
1355         }
1356         else if (expected_token == '=')
1357         {
1358           if (cp - last_token - 1 == StringSpn (last_token + 1, " \t"))
1359           {
1360             return BRACKET_ERR_MISMATCHED_BRACKETS;
1361           }
1362           else
1363           {
1364             return BRACKET_ERR_MISSING_EQUALS;
1365           }
1366         }
1367         else if (*cp == '=')
1368         {
1369           if (expected_token == ']')
1370           {
1371             return BRACKET_ERR_MULT_EQUALS;
1372           }
1373           else
1374           {
1375             return BRACKET_ERR_MISMATCHED_BRACKETS;
1376           }
1377         }
1378         else
1379         {
1380           return BRACKET_ERR_MISMATCHED_BRACKETS;
1381         }
1382         break;
1383     }
1384     cp = NextBracketToken (cp + 1);
1385   }
1386   
1387   if (cp == NULL && expected_token != '[')
1388   {
1389     return BRACKET_ERR_MISMATCHED_BRACKETS;
1390   }
1391   
1392   return BRACKET_ERR_NO_ERR;
1393 }
1394 
1395 static ModifierInfoPtr 
1396 ParseOneBracketedModifier 
1397 (CharPtr      str, 
1398  CharPtr PNTR bracket_start,
1399  CharPtr PNTR bracket_stop)
1400 {
1401   CharPtr         start, stop, eq_loc;
1402   ModifierInfoPtr mip;
1403   Int4            value_len, name_len;
1404   CharPtr         canonical_name;
1405   
1406   start = NextBracketToken (str);
1407   while (start != NULL && *start != '[')
1408   {
1409     start = NextBracketToken (start + 1);
1410   }
1411   if (start == NULL) return NULL;
1412   eq_loc = NextBracketToken (start + 1);
1413   if (eq_loc == NULL) return NULL;
1414   if (*eq_loc == ']')
1415   {
1416     stop = eq_loc;
1417   }
1418   else if (*eq_loc == '=')
1419   {
1420     stop = NextBracketToken (eq_loc + 1);
1421   }
1422   else
1423   {
1424     return NULL;
1425   }
1426   
1427   if (stop == NULL || *stop != ']') return NULL;
1428       
1429   mip = ModifierInfoNew();
1430   if (mip == NULL) return NULL;
1431   
1432   /* copy in modifier name */
1433   name_len = eq_loc - start + 1;
1434   mip->name = (CharPtr) MemNew (name_len * sizeof (Char));
1435   if (mip->name == NULL)
1436   {
1437     mip = ModifierInfoFree (mip);
1438     return NULL;
1439   }
1440   StringNCpy (mip->name, start + 1, name_len - 2);
1441   mip->name [name_len - 1] = 0;
1442   TrimSpacesAroundString (mip->name);
1443   canonical_name = GetCanonicalName (mip->name);
1444   mip->name = MemFree (mip->name);
1445   mip->name = canonical_name;
1446   if (StringICmp (mip->name, "note") == 0)
1447   {
1448     mip->name = MemFree (mip->name);
1449     mip->name = StringSave ("Note-SubSrc");
1450   }
1451 
1452   /* [orf], [rna], and [dna] don't have values */  
1453   if (stop > eq_loc)
1454   {
1455     value_len = stop - eq_loc + 1;
1456     mip->value = (CharPtr) MemNew (value_len * sizeof (Char));
1457     if (mip->value == NULL)
1458     {
1459       mip = ModifierInfoFree (mip);
1460       return NULL;
1461     }
1462   
1463     StringNCpy (mip->value, eq_loc + 1, value_len - 2);
1464     mip->value [value_len - 1] = 0;
1465     TrimSpacesAroundString (mip->value);
1466   }
1467   
1468   mip->modtype = GetModifierType (mip->name);
1469   if (mip->modtype == eModifierType_SourceQual)
1470   {
1471     mip->subtype = FindTypeForModNameText (mip->name);
1472   }
1473   else
1474   {
1475     mip->subtype = 0;
1476   }
1477   
1478   if (bracket_start != NULL)
1479   {
1480     *bracket_start = start;
1481   }
1482   
1483   if (bracket_stop != NULL)
1484   {
1485     *bracket_stop = stop;
1486   }
1487   
1488   return mip;
1489 }
1490 
1491 static ValNodePtr ParseAllBracketedModifiers (CharPtr str)
1492 {
1493   CharPtr         stop, cp;
1494   ValNodePtr      list = NULL;
1495   ModifierInfoPtr mip;
1496   
1497   cp = str;
1498   mip = ParseOneBracketedModifier (cp, NULL, &stop);
1499   while (mip != NULL && stop != NULL)
1500   {
1501     ValNodeAddPointer (&list, 0, mip);
1502     cp = stop + 1;
1503     mip = ParseOneBracketedModifier (cp, NULL, &stop);  
1504   }
1505   return list;
1506 }
1507 
1508 static Boolean IsValueInEnumAssoc (CharPtr value, EnumFieldAssocPtr eap)
1509 {
1510   while (eap != NULL && eap->name != NULL) 
1511   {
1512     if (StringICmp (eap->name, value) == 0)
1513     {
1514       return TRUE;
1515     }
1516     eap++;
1517   }
1518   return FALSE;
1519 }
1520 
1521 static Int4 GeneticCodeFromStringAndList (CharPtr str, ValNodePtr list)
1522 {
1523   while (list != NULL)
1524   {
1525     if (StringICmp (str, list->data.ptrvalue) == 0)
1526     {
1527       return list->choice;
1528     }
1529     list = list->next;
1530   }
1531   return 0;
1532 }
1533 
1534 static Int4 GeneticCodeFromString (CharPtr str)
1535 {
1536   ValNodePtr gencodelist;
1537   Int4       gcode = 0;
1538   
1539   if (StringHasNoText (str))
1540   {
1541     gcode = 0;
1542   }
1543   else if (isdigit (str[0]))
1544   {
1545     gcode = atoi (str);
1546   }
1547   else
1548   {
1549     gencodelist = GetGeneticCodeValNodeList ();
1550     gcode = GeneticCodeFromStringAndList (str, gencodelist);
1551     gencodelist = ValNodeFreeData (gencodelist);
1552   }
1553   return gcode;
1554 }
1555 
1556 static Int4 MolTypeFromString (CharPtr str)
1557 {
1558   EnumFieldAssocPtr  eap;
1559 
1560   if (StringICmp (str, "dna") == 0)
1561   {
1562     return 253;
1563   }
1564   else if (StringICmp (str, "rna") == 0)
1565   {
1566     return 254;
1567   }
1568   else if (StringICmp (str, "genomic") == 0)
1569   {
1570     return 253;
1571   }
1572   for (eap = biomol_nucGen_alist; eap != NULL && eap->name != NULL; eap++)
1573   {
1574     if (StringsAreEquivalent (eap->name, str))
1575     {
1576       return eap->value;
1577     }
1578   }
1579   for (eap = biomol_nucX_alist; eap != NULL && eap->name != NULL; eap++)
1580   {
1581     if (StringsAreEquivalent (eap->name, str))
1582     {
1583       return eap->value;
1584     }
1585     else if (eap->name [0] == 'm'
1586              && StringICmp (eap->name, "mRNA [cDNA]") == 0
1587              && StringICmp (str, "mRNA") == 0)
1588     {
1589       return eap->value;
1590     }
1591   }
1592   return 0;
1593 }
1594 
1595 
1596 /* This function looks at a parsed modifier structure to determine whether the
1597  * value is acceptable for this modifier type.
1598  */
1599 static Boolean ModifierHasInvalidValue (ModifierInfoPtr mip)
1600 {
1601   Boolean rval = FALSE;
1602   
1603   if (mip != NULL
1604       && ((mip->modtype == eModifierType_Location
1605                   && !IsValueInEnumAssoc (mip->value, biosource_genome_simple_alist))
1606             || (mip->modtype == eModifierType_Origin
1607                   && !IsValueInEnumAssoc (mip->value, biosource_origin_alist))      
1608             || (mip->modtype == eModifierType_Topology
1609                   && !IsValueInEnumAssoc (mip->value, topology_nuc_alist))
1610             || (mip->modtype == eModifierType_Molecule
1611                   && !IsValueInEnumAssoc (mip->value, molecule_alist))
1612             || ((mip->modtype == eModifierType_GeneticCode
1613                       || mip->modtype == eModifierType_NucGeneticCode
1614                       || mip->modtype == eModifierType_MitoGeneticCode)
1615                      && GeneticCodeFromString (mip->value) == 0)
1616             || (mip->modtype == eModifierType_MolType
1617                      && MolTypeFromString (mip->value) == 0)
1618             || (mip->modtype == eModifierType_SourceQual
1619                      && IsNonTextModifier (mip->name)
1620                      && !StringHasNoText (mip->value)
1621                  && StringICmp (mip->value, "TRUE") != 0
1622                  && StringICmp (mip->value, "FALSE") != 0)))
1623   {
1624     rval = TRUE;
1625   }
1626 
1627   return rval;
1628 }
1629 
1630 /* This section contains functions for finding, changing, and removing 
1631  * bracketed value pairs in definition lines.
1632  * These functions include:
1633  *
1634  * FindValuePairInDefLine - returns pointer to position in title where 
1635  *                          the first bracketed pair with the specified 
1636  *                          modifier name (or one of its aliases) occurs.
1637  *                          Useful for non-text modifiers, which do not
1638  *                          have values.
1639  *
1640  * FindValueFromPairInDefline - returns value from the first bracketed
1641  *                              pair in the title with the specified
1642  *                              modifier name (or one of its aliases).
1643  *
1644  * RemoveValueFromDefline - removes the first bracketed pair in the title
1645  *                          with the specified modifier name (or one of its aliases)
1646  *
1647  * ReplaceValueInThisValuePair - replaces the value in the specified value pair.
1648  *                               If the new value is empty, the pair is removed.
1649  *
1650  * ReplaceValueInOneDefLine - finds the first bracketed pair in the title
1651  *                            with the specified modifier name (or one of its aliases).
1652  *                            If a pair is found, the value in that pair is replaced
1653  *                            with the new value; otherwise a new pair is added to
1654  *                            the title.  
1655  *
1656  * ReplaceOneModifierValue - finds all bracketed pairs in a title with the specified
1657  *                           modifier name or one of its aliases and the specified value
1658  *                           and replaces that value with the new value (or removes the
1659  *                           pair, if the new value is empty).
1660  *
1661  * RemoveAllDuplicatePairsFromOneTitle - removes all bracketed pairs that are duplicates
1662  *                                       in name and value of another pair already in
1663  *                                       the title.
1664  *
1665  * RemoveMeaninglessEmptyPairsFromOneTitle - removes bracketed pairs without values
1666  *                                           that are not non-text modifiers
1667  *
1668  * StripAllInstancesOfModNameFromTitle - removes all mentions of specified modifier
1669  *                                       name from title
1670  *
1671  */
1672 
1673 static CharPtr FindValuePairInDefLine (CharPtr mod_name, CharPtr def_line, CharPtr PNTR valstop)
1674 {
1675   CharPtr         cp, start, stop;
1676   ModifierInfoPtr mip;
1677   CharPtr         canonical_name;
1678   
1679   if (mod_name == NULL || def_line == NULL)
1680   {
1681     return NULL;
1682   }
1683   
1684   cp = NextBracketToken (def_line);
1685   if (cp == NULL)
1686   {
1687     return NULL;
1688   }
1689   
1690   canonical_name = GetCanonicalName (mod_name);
1691   
1692   mip = ParseOneBracketedModifier (cp, &start, &stop);
1693   while (mip != NULL && start != NULL && stop != NULL 
1694          && StringICmp (mip->name, canonical_name) != 0)
1695   {
1696     cp = NextBracketToken (stop + 1);
1697     mip = ModifierInfoFree (mip);
1698     mip = ParseOneBracketedModifier (cp, &start, &stop);
1699   }
1700   
1701   if (mip != NULL && StringICmp (mip->name, canonical_name) == 0)
1702   {
1703     mip = ModifierInfoFree (mip);
1704     if (valstop != NULL)
1705     {
1706       *valstop = stop;
1707     }
1708     canonical_name = MemFree (canonical_name);
1709     return start;
1710   }
1711   else
1712   {
1713     mip = ModifierInfoFree (mip);
1714     canonical_name = MemFree (canonical_name);
1715     return NULL;
1716   }
1717 }
1718 
1719 static CharPtr FindNthValuePairInDefLine (CharPtr title, CharPtr val_name, Int4 val_num, CharPtr PNTR p_val_end)
1720 {
1721   CharPtr val_loc, val_end = NULL;
1722   Int4    title_val_num;
1723   
1724   if (StringHasNoText (val_name))
1725   {
1726     return NULL;
1727   }
1728   
1729   val_loc = FindValuePairInDefLine (val_name, title, &val_end);
1730   title_val_num = 0;
1731   while (val_loc != NULL && val_end != NULL && title_val_num != val_num)
1732   {
1733     val_loc = FindValuePairInDefLine (val_name, val_end + 1, &val_end);
1734     title_val_num++;
1735   }
1736   if (p_val_end != NULL)
1737   {
1738     *p_val_end = val_end;
1739   }
1740   return val_loc;
1741 }
1742 
1743 static CharPtr FindValueFromPairInDefline (CharPtr mod_name, CharPtr def_line)
1744 {
1745   CharPtr bracket_start, eq_loc, bracket_end;
1746   CharPtr new_val = NULL;
1747   Int4 new_val_len;
1748   
1749   bracket_start = FindValuePairInDefLine (mod_name, def_line, &bracket_end);
1750   if (bracket_start == NULL || bracket_end == NULL)
1751   {
1752     return NULL;
1753   }
1754   
1755   eq_loc = NextBracketToken (bracket_start + 1);
1756   if (eq_loc == NULL || *eq_loc != '=')
1757   {
1758     return NULL;
1759   }
1760     
1761   new_val_len = bracket_end - eq_loc;
1762   new_val = (CharPtr) MemNew (new_val_len * sizeof (Char));
1763   if (new_val != NULL)
1764   {
1765     StringNCpy (new_val, eq_loc + 1, new_val_len - 1);
1766     new_val [new_val_len - 1] = 0;
1767   }
1768   TrimSpacesAroundString (new_val);
1769   return new_val;
1770 }
1771 
1772 static CharPtr FindValueFromPairInDeflineBeforeCharPtr (CharPtr mod_name, CharPtr def_line, CharPtr cp)
1773 {
1774   CharPtr bracket_start, bracket_end;
1775   
1776   bracket_start = FindValuePairInDefLine (mod_name, def_line, &bracket_end);
1777   if (bracket_start == NULL || (cp != NULL && bracket_start > cp))
1778   {
1779     return NULL;
1780   }
1781   else
1782   {
1783     return FindValueFromPairInDefline (mod_name, bracket_start);
1784   }
1785 }
1786 
1787 static void RemoveValuePairFromDefline (CharPtr pair_start, CharPtr pair_end, CharPtr defline)
1788 {
1789   CharPtr src, dst;
1790 
1791   if (pair_start == NULL || pair_end == NULL || defline == NULL
1792       || pair_end <= pair_start)
1793   {
1794     return;
1795   }
1796   
1797   dst = pair_start;
1798   src = pair_end;
1799   while (isspace (*src))
1800   {
1801     src++;
1802   }
1803   
1804   while (*src != 0)
1805   {
1806     *dst = *src;
1807     dst++;
1808     src++;
1809   }
1810   *dst = 0;
1811 }
1812 
1813 static void RemoveValueFromDefline (CharPtr mod_name, CharPtr def_line)
1814 {
1815   CharPtr bracket_start, bracket_end;
1816   
1817   bracket_start = FindValuePairInDefLine (mod_name, def_line, &bracket_end);
1818   if (bracket_start == NULL || bracket_end == NULL)
1819   {
1820     return;
1821   }
1822   
1823   RemoveValuePairFromDefline (bracket_start, bracket_end + 1, def_line);
1824 }
1825 
1826 static CharPtr AddQuotesToValueWithBrackets (CharPtr orig_value)
1827 {
1828   CharPtr first_bracket, first_quote;
1829   CharPtr cp, new_value = NULL, tmp_value;
1830   Char    bracket_buf [2];
1831   Int4    offset;
1832   
1833   if (orig_value == NULL)
1834   {
1835     return NULL;
1836   }
1837   else if (StringHasNoText (orig_value))
1838   {
1839     return StringSave (orig_value);
1840   }
1841   
1842   new_value = StringSave (orig_value);
1843   
1844   first_bracket = StringChr (new_value, '[');
1845   if (first_bracket == NULL)
1846   {
1847     first_bracket = StringChr (new_value, ']');
1848   }
1849   
1850   first_quote = NextUnescapedQuote (new_value);
1851   
1852   if (first_bracket == NULL && first_quote == NULL)
1853   {
1854     return new_value;
1855   }
1856   else if (first_bracket != NULL && first_quote == NULL)
1857   {
1858     tmp_value = (CharPtr) MemNew ((StringLen (new_value) + 3) * sizeof (Char));
1859     if (tmp_value == NULL)
1860     {
1861       new_value = MemFree (new_value);
1862       return NULL;
1863     }
1864     StringCat (tmp_value, "\"");
1865     StringCat (tmp_value, new_value);
1866     StringCat (tmp_value, "\"");
1867     new_value = MemFree (new_value);
1868     new_value = tmp_value;
1869     return new_value;
1870   }
1871   
1872   cp = orig_value;
1873   
1874   bracket_buf [0] = 0;
1875   bracket_buf [1] = 0;
1876   
1877   while (*cp != 0)
1878   {
1879     if (*cp == '"' && (cp == orig_value || *(cp - 1) != '\\'))
1880     {
1881       cp = NextUnescapedQuote (cp + 1);
1882       if (cp == NULL)
1883       {
1884         tmp_value = (CharPtr) MemNew ((StringLen (new_value) + 3) * sizeof (Char));
1885         if (tmp_value == NULL)
1886         {
1887           new_value = MemFree (new_value);
1888           return NULL;
1889         }
1890         StringCpy (tmp_value, new_value);
1891         if (new_value [StringLen (new_value) - 1] == '\\')
1892         {
1893           StringCat (tmp_value, " ");
1894         }
1895         StringCat (tmp_value, "\"");
1896         return tmp_value;
1897       }
1898       else
1899       {
1900         cp++;
1901       }
1902     }
1903     else if (*cp == '[' || *cp == ']')
1904     {
1905       tmp_value = (CharPtr) MemNew ((StringLen (new_value) + 3) * sizeof (Char));
1906       if (tmp_value == NULL)
1907       {
1908         new_value = MemFree (new_value);
1909         return new_value;
1910       }
1911       offset = cp - new_value;
1912       StringNCpy (tmp_value, new_value, offset);
1913       StringCat (tmp_value, "\"");
1914       bracket_buf [0] = *cp;
1915       StringCat (tmp_value, bracket_buf);
1916       StringCat (tmp_value, "\"");
1917       StringCat (tmp_value, cp + 1);
1918       new_value = MemFree (new_value);
1919       new_value = tmp_value;
1920       cp = new_value + offset + 3;      
1921     }
1922     else
1923     {
1924       cp++;
1925     }
1926   }
1927   
1928   return new_value;
1929 }
1930 
1931 static CharPtr
1932 ReplaceValueInThisValuePair 
1933 (CharPtr orig_defline,
1934  CharPtr value_loc, 
1935  CharPtr value_name,
1936  CharPtr end_loc, 
1937  CharPtr new_value)
1938 {
1939   CharPtr new_title;
1940   Int4    new_title_len = 0;
1941   Boolean is_nontext;
1942   CharPtr tmp_name;
1943   CharPtr fixed_value;
1944 
1945   if (StringHasNoText (orig_defline) || value_loc == NULL || end_loc == NULL
1946       || *value_loc != '[' || *end_loc != ']')
1947   {
1948     return orig_defline;
1949   }
1950 
1951   fixed_value = AddQuotesToValueWithBrackets (new_value);
1952   
1953   if (StringHasNoText (fixed_value))
1954   {
1955     RemoveValuePairFromDefline (value_loc, end_loc, orig_defline);
1956   }
1957   else
1958   {
1959     /* keep part before pair and after pair, insert new value in position */
1960     new_title_len = StringLen (orig_defline)
1961                                + StringLen (value_name)
1962                                + StringLen (fixed_value)
1963                                + 5;
1964     new_title = MemNew (new_title_len * sizeof (Char));
1965     if (new_title != NULL)
1966     {
1967       if (value_loc > orig_defline)
1968       {
1969         StringNCpy (new_title, orig_defline, value_loc - orig_defline);
1970       }
1971       StringCat (new_title, "[");
1972       tmp_name = StringSave (value_name);
1973       tmp_name [0] = TO_LOWER (tmp_name [0]);
1974       StringCat (new_title, tmp_name);
1975       is_nontext = IsNonTextModifier (tmp_name);
1976       tmp_name = MemFree (tmp_name);
1977       StringCat (new_title, "=");
1978       if (!is_nontext)
1979       {
1980         StringCat (new_title, fixed_value);
1981       }
1982       StringCat (new_title, "]");
1983       if (end_loc != NULL && *end_loc != 0)
1984       {
1985         if (*end_loc == ']')
1986         {
1987           StringCat (new_title, end_loc + 1);
1988         }
1989         else
1990         {
1991           StringCat (new_title, end_loc);
1992         }
1993       }
1994       orig_defline = MemFree (orig_defline);
1995       orig_defline = new_title;
1996     }
1997   }  
1998   TrimSpacesAroundString (orig_defline);
1999   
2000   fixed_value = MemFree (fixed_value);
2001   
2002   return orig_defline;
2003 }
2004 
2005 static CharPtr InsertStringAtOffset (CharPtr old_string, CharPtr new_string, Int4 offset)
2006 {
2007   Int4    new_len;
2008   CharPtr new_str = NULL;
2009   
2010   if (old_string == NULL)
2011   {
2012     new_str = StringSave (new_string);
2013   }
2014   else if (new_string == NULL)
2015   {
2016     new_str =  StringSave (old_string);
2017   }
2018   else
2019   {
2020     new_len = StringLen (old_string) + StringLen (new_string) + 1;
2021     new_str = (CharPtr) MemNew (new_len * sizeof (Char));
2022     if (new_str != NULL)
2023     {
2024       StringNCpy (new_str, old_string, offset);
2025       StringCat (new_str, new_string);
2026       if ((Uint4)offset < StringLen (old_string))
2027       {
2028         StringCat (new_str, old_string + offset);
2029       }
2030     }
2031   }
2032   return new_str;
2033 }
2034 
2035 static CharPtr 
2036 InsertValuePairAtOffset 
2037 (CharPtr orig_defline, 
2038  CharPtr value_name, 
2039  CharPtr value_str,
2040  Int4    offset)
2041 {
2042   CharPtr pair_string, fixed_value;
2043   
2044   if (StringHasNoText (value_name) || offset < 0)
2045   {
2046     return orig_defline;
2047   }
2048   
2049   fixed_value = AddQuotesToValueWithBrackets (value_str);
2050   
2051   pair_string = (CharPtr) MemNew ((StringLen (value_name) + StringLen (fixed_value) + 6) * sizeof (Char));
2052   if (pair_string != NULL)
2053   {
2054     if (IsNonTextModifier (value_name))
2055     {
2056       sprintf (pair_string, "[%s=]", value_name);
2057     }
2058     else
2059     {
2060       sprintf (pair_string, "[%s=%s]", value_name, fixed_value);
2061     }
2062     orig_defline = InsertStringAtOffset (orig_defline, pair_string, offset);
2063     pair_string = MemFree (pair_string);
2064   }
2065   fixed_value = MemFree (fixed_value);
2066   return orig_defline;
2067 }
2068 
2069 
2070 static CharPtr
2071 ReplaceValueInOneDefLineForOrganism
2072 (CharPtr orig_defline,
2073  CharPtr value_name, 
2074  CharPtr new_value,
2075  CharPtr organism)
2076 { 
2077   CharPtr value_loc = NULL, end_loc = NULL;
2078   CharPtr fixed_value;
2079   CharPtr next_org_loc = NULL, org_stop = NULL, first_org_stop = NULL;
2080   CharPtr first_organism;
2081   
2082   if (StringHasNoText (value_name))
2083   {
2084     return orig_defline;
2085   }
2086   
2087   /* if we want to add a value to a specific organism, we need to make sure
2088    * that we insert or replace a value after that organism name but before
2089    * the next organism name.
2090    */
2091    
2092   if (organism != NULL)
2093   {
2094     if (organism < orig_defline || organism - orig_defline > (Int4) StringLen (orig_defline))
2095     {
2096       organism = NULL;
2097     }
2098   }
2099   
2100   if (organism != NULL)
2101   {
2102     if (organism != FindValuePairInDefLine ("organism", organism, &org_stop))
2103     {
2104       return orig_defline;
2105     }
2106   }
2107   
2108   first_organism = FindValuePairInDefLine ("organism", orig_defline, &first_org_stop);
2109 
2110   
2111   if (organism == NULL)
2112   {
2113     organism = first_organism;
2114     org_stop = first_org_stop;
2115   }
2116   
2117   if (org_stop != NULL)
2118   {
2119     next_org_loc = FindValuePairInDefLine ("organism", org_stop + 1, NULL);
2120   }
2121   
2122   fixed_value = AddQuotesToValueWithBrackets (new_value);
2123   
2124   /* if this is the first organism, or if we have no organism, start looking for
2125    * a value to replace at the beginning of the line.
2126    */
2127   if (organism == NULL || organism == first_organism)
2128   {
2129     value_loc = FindValuePairInDefLine (value_name, orig_defline, &end_loc);
2130   }
2131   else
2132   {     
2133     value_loc = FindValuePairInDefLine (value_name, organism, &end_loc);
2134   }
2135   
2136   if (next_org_loc != NULL && value_loc > next_org_loc)
2137   {
2138     value_loc = NULL;
2139   }
2140   
2141   if (StringHasNoText (fixed_value))
2142   {
2143     if (value_loc == NULL)
2144     {
2145       /* old line had no value, no new value provided, no change */
2146     }
2147     else
2148     {
2149       RemoveValuePairFromDefline (value_loc, end_loc, orig_defline);
2150     }
2151   }
2152   else
2153   {
2154     if (value_loc == NULL)
2155     {
2156       /* add new value just before next organism */
2157       if (next_org_loc == NULL)
2158       {
2159         orig_defline = InsertValuePairAtOffset (orig_defline, value_name, new_value,
2160                                                 StringLen (orig_defline));
2161       }
2162       else
2163       {
2164         orig_defline = InsertValuePairAtOffset (orig_defline, value_name, new_value,
2165                                                 next_org_loc - orig_defline);
2166       }
2167     }
2168     else
2169     {
2170       /* replace this value */
2171       orig_defline = ReplaceValueInThisValuePair (orig_defline, value_loc, value_name,
2172                                                   end_loc, new_value);
2173     }
2174   }  
2175   TrimSpacesAroundString (orig_defline);
2176   
2177   fixed_value = MemFree (fixed_value);
2178   
2179   return orig_defline;
2180 }
2181 
2182 static CharPtr 
2183 ReplaceValueInOneDefLine 
2184 (CharPtr orig_defline,
2185  CharPtr value_name, 
2186  CharPtr new_value)
2187 {
2188   CharPtr value_loc = NULL, end_loc = NULL;
2189   
2190   if (StringHasNoText (value_name))
2191   {
2192     return orig_defline;
2193   }
2194   
2195   value_loc = FindValuePairInDefLine (value_name, orig_defline, &end_loc);
2196 
2197   if (value_loc == NULL)
2198   {
2199     if (StringHasNoText (new_value))
2200     {
2201       /* old line had no value, no new value provided, no change */    
2202       return orig_defline;
2203     }
2204     else
2205     {
2206       /* make sure value is added for first organism */
2207       orig_defline = ReplaceValueInOneDefLineForOrganism (orig_defline, value_name,
2208                                                           new_value, NULL);        
2209     }
2210   }
2211   else
2212   {
2213     orig_defline = ReplaceValueInThisValuePair (orig_defline, value_loc, value_name, end_loc, new_value);
2214   }  
2215   
2216   return orig_defline;
2217 }
2218 
2219 static CharPtr 
2220 ReplaceOneModifierValue 
2221 (CharPtr title,
2222  CharPtr orig_name, 
2223  CharPtr orig_value,
2224  CharPtr repl_value,
2225  Boolean is_nontext,
2226  Boolean copy_to_note)
2227 {
2228   CharPtr bracket_loc, eq_loc, end_bracket_loc, new_title;
2229   Int4    new_title_len;
2230   CharPtr orig_note, new_note;
2231   Boolean any_replaced = FALSE;
2232   
2233   if (StringHasNoText (title)
2234       || StringHasNoText (orig_name))
2235   {
2236     return title;
2237   }
2238   
2239   bracket_loc = FindValuePairInDefLine (orig_name, title, &end_bracket_loc);
2240   while (bracket_loc != NULL && end_bracket_loc != NULL)
2241   {  
2242     eq_loc = NextBracketToken (bracket_loc + 1);
2243     if (eq_loc == NULL || *eq_loc != '=')
2244     {
2245       return title;
2246     }
2247     if ((StringNCmp (orig_value, eq_loc + 1, StringLen (orig_value)) == 0
2248         && StringLen (orig_value) == end_bracket_loc - eq_loc - 1)
2249         || (StringHasNoText (orig_value) 
2250             && StringSpn (eq_loc + 1, " \t") == end_bracket_loc - eq_loc - 1))
2251     {
2252       new_title_len = StringLen (title) + StringLen (repl_value) - StringLen (orig_value) + 1;
2253       new_title = (CharPtr) MemNew (new_title_len * sizeof (Char));
2254       if (new_title == NULL)
2255       {
2256         return title;
2257       }
2258       if (is_nontext)
2259       {
2260         if (StringHasNoText (repl_value))
2261         {
2262           StringNCpy (new_title, title, bracket_loc - title);
2263           StringCat (new_title, end_bracket_loc + 1 + StringSpn (end_bracket_loc, " "));
2264         }
2265         else
2266         {
2267           StringNCpy (new_title, title, eq_loc - title + 1);
2268           StringCat (new_title, end_bracket_loc);
2269         }
2270       }
2271       else if (StringHasNoText (repl_value))
2272       {
2273         /* remove pair completely */
2274         StringNCpy (new_title, title, bracket_loc - title);
2275         StringCat (new_title, end_bracket_loc + 1);
2276       }
2277       else
2278       {
2279         StringNCpy (new_title, title, eq_loc - title + 1);
2280         StringCat (new_title, repl_value);
2281         StringCat (new_title, end_bracket_loc);        
2282       }
2283 
2284       title = MemFree (title);
2285       title = new_title;
2286       any_replaced = TRUE;
2287       bracket_loc = FindValuePairInDefLine (orig_name, title, &end_bracket_loc);
2288     }
2289     else
2290     {
2291       bracket_loc = FindValuePairInDefLine (orig_name, end_bracket_loc, &end_bracket_loc);
2292     }
2293   }
2294   
2295   if (any_replaced && copy_to_note && !StringHasNoText (repl_value) && !StringHasNoText (orig_value))
2296   {
2297     orig_note = FindValueFromPairInDefline ("note", title);
2298     if (StringHasNoText (orig_note))
2299     {
2300       new_note = (CharPtr) MemNew ((StringLen (orig_name) 
2301                                     + StringLen (orig_value) + 8) * sizeof (Char));
2302       if (new_note != NULL)
2303       {
2304         sprintf (new_note, "%s was %s", orig_name, orig_value);
2305       }
2306     }
2307     else
2308     {
2309       new_note = (CharPtr) MemNew ((StringLen (orig_note)
2310                                     + StringLen (orig_name) 
2311                                     + StringLen (orig_value) + 8) * sizeof (Char));
2312       if (new_note != NULL)
2313       {
2314         sprintf (new_note, "%s; %s was %s", orig_note, orig_name, orig_value);
2315       }
2316     }
2317 
2318     if (new_note != NULL)
2319     {
2320       title = ReplaceValueInOneDefLine (title, "note", new_note); 
2321     }
2322     
2323     orig_note = MemFree (orig_note);
2324     new_note = MemFree (new_note);
2325   }
2326   
2327   return title;
2328 }
2329 
2330 static Boolean IsUnrecognizedModifierName (ModifierInfoPtr mip, Boolean is_nuc);
2331 
2332 static void RemoveRecognizedModifiersFromTitle (CharPtr title, ValNodePtr modifier_info_list, Boolean is_nuc)
2333 {
2334   ValNodePtr      vnp;
2335   ModifierInfoPtr mip;
2336 
2337   for (vnp = modifier_info_list; vnp != NULL; vnp = vnp->next) {
2338     mip = (ModifierInfoPtr) vnp->data.ptrvalue;
2339     if (mip != NULL && ! IsUnrecognizedModifierName (mip, is_nuc)
2340         && (!is_nuc || mip->modtype != eModifierType_Protein)) {
2341       RemoveValueFromDefline (mip->name, title);
2342     }
2343   }
2344 }
2345 
2346 static void StripAllInstancesOfModNameFromTitle (CharPtr mod_name, CharPtr title)
2347 {
2348   CharPtr         valstr;
2349 
2350   valstr = FindValueFromPairInDefline (mod_name, title);
2351   while (valstr != NULL)
2352   {
2353     RemoveValueFromDefline (mod_name, title);
2354     valstr = MemFree (valstr);
2355     valstr = FindValueFromPairInDefline (mod_name, title);
2356   }     
2357 }
2358 
2359 static CharPtr RemoveAllDuplicatePairsFromOneTitle (CharPtr title)
2360 {
2361   CharPtr         start_bracket, end_bracket, tmp_title, new_title;
2362   ModifierInfoPtr mip;
2363   Int4            offset;
2364 
2365   mip = ParseOneBracketedModifier (title, &start_bracket, &end_bracket);
2366   while (mip != NULL && start_bracket != NULL && end_bracket != NULL)
2367   {
2368     offset = end_bracket - title + 1;
2369     tmp_title = StringSave (title + offset);
2370     tmp_title = ReplaceOneModifierValue (tmp_title, mip->name, mip->value, NULL,
2371                                      IsNonTextModifier (mip->name), FALSE);
2372     new_title = (CharPtr) MemNew ((StringLen (tmp_title) + offset + 1)* sizeof (Char));
2373     if (new_title != NULL)
2374     {
2375       StringNCpy (new_title, title, offset);
2376       StringCat (new_title, tmp_title);
2377     }
2378     tmp_title = MemFree (tmp_title);
2379     title = MemFree (title);
2380     title = new_title;
2381     mip = ModifierInfoFree (mip);
2382     mip = ParseOneBracketedModifier (title + offset, &start_bracket, &end_bracket);
2383   }
2384   mip = ModifierInfoFree (mip);
2385   return title;
2386 }
2387 
2388 static void ShiftString (CharPtr str, Int4 shift_size)
2389 {
2390   CharPtr src, dst;
2391   
2392   if (str == NULL)
2393   {
2394     return;
2395   }
2396   
2397   if (shift_size > (Int4) StringLen (str))
2398   {
2399     *str = 0;
2400   }
2401   else
2402   {
2403     src = str + shift_size;
2404     dst = str;
2405     while (*src != 0)
2406     {
2407       *dst = *src;
2408       dst++;
2409       src++;
2410     }
2411     *dst = 0;
2412   }  
2413 }
2414 
2415 static void RemoveMeaninglessEmptyPairsFromOneTitle (CharPtr title)
2416 {
2417   CharPtr         start_bracket, end_bracket;
2418   ModifierInfoPtr mip;
2419 
2420   mip = ParseOneBracketedModifier (title, &start_bracket, &end_bracket);
2421   while (mip != NULL && start_bracket != NULL && end_bracket != NULL)
2422   {
2423     if (StringHasNoText (mip->value) && ! IsNonTextModifier (mip->name))
2424     {
2425       ShiftString (start_bracket, end_bracket - start_bracket + 1);
2426       mip = ModifierInfoFree (mip);
2427       mip = ParseOneBracketedModifier (start_bracket, &start_bracket, &end_bracket);
2428     }
2429     else
2430     {
2431       mip = ModifierInfoFree (mip);
2432       mip = ParseOneBracketedModifier (end_bracket + 1, &start_bracket, &end_bracket);
2433     }
2434   }
2435   mip = ModifierInfoFree (mip);
2436 }
2437 
2438 static void ApplyOneModToSeqEntry (SeqEntryPtr sep, CharPtr mod_name, CharPtr mod_value)
2439 {
2440   BioseqPtr    bsp = NULL;
2441   SeqDescrPtr  sdp = NULL;
2442   
2443   if (sep == NULL || StringHasNoText (mod_name))
2444   {
2445     return;
2446   }
2447   
2448   if (IS_Bioseq (sep))
2449   {
2450     bsp = (BioseqPtr) sep->data.ptrvalue;
2451   }
2452   else if (IS_Bioseq_set (sep))
2453   {
2454     sep = FindNucSeqEntry (sep);
2455     if (sep != NULL && IS_Bioseq (sep))
2456     {
2457       bsp = (BioseqPtr) sep->data.ptrvalue;
2458     }
2459   }
2460   
2461   if (bsp == NULL)
2462   {
2463     return;
2464   }
2465   
2466   for (sdp = bsp->descr; sdp != NULL && sdp->choice != Seq_descr_title; sdp = sdp->next)
2467   {
2468   }
2469 
2470   if (sdp == NULL)
2471   {
2472     sdp = SeqDescrNew (NULL);
2473     sdp->choice = Seq_descr_title;
2474     if (bsp->descr == NULL)
2475     {
2476       bsp->descr = sdp;
2477     }
2478   }
2479   if (sdp != NULL)
2480   {
2481     sdp->data.ptrvalue = ReplaceValueInOneDefLine (sdp->data.ptrvalue,
2482                                                    mod_name, mod_value);
2483   }
2484   
2485   
2486 }
2487 
2488 static ModifierInfoPtr MakeModifierInfoFromNameAndValue (CharPtr value_name, CharPtr value_string)
2489 {
2490   ModifierInfoPtr mip;
2491   CharPtr tmp_pair;
2492   CharPtr fixed_value;
2493 
2494   fixed_value = AddQuotesToValueWithBrackets (value_string);  
2495   tmp_pair = (CharPtr) MemNew ((StringLen (value_name) + StringLen (fixed_value) + 4));
2496   if (tmp_pair == NULL)
2497   {
2498     return NULL;
2499   }
2500   sprintf (tmp_pair, "[%s=%s]", value_name == NULL ? "" : value_name,
2501                              fixed_value == NULL ? "" : fixed_value);
2502   mip = ParseOneBracketedModifier (tmp_pair, NULL, NULL);
2503   tmp_pair = MemFree (tmp_pair);
2504   fixed_value = MemFree (fixed_value);
2505   return mip;
2506 }
2507 
2508 /* This section is used to import tables of modifiers. */
2509 static CharPtr 
2510 ApplyImportModToTitle 
2511 (CharPtr title, 
2512  CharPtr value_name, 
2513  CharPtr value_string,
2514  Boolean erase_where_blank,
2515  Boolean parse_multiple)
2516 {
2517   ModifierInfoPtr mip;
2518   CharPtr next_semi, val_start, title_loc, title_end;
2519   CharPtr insert_point;
2520   Int4    insert_offset, title_val_num;
2521   Char    val_save_ch;
2522 
2523   if (StringHasNoText (value_name))
2524   {
2525     return title;
2526   }
2527   
2528   if (!erase_where_blank && StringHasNoText (value_string))
2529   {
2530     return title;
2531   }
2532   
2533   mip = MakeModifierInfoFromNameAndValue (value_name, value_string);
2534 
2535   if (mip == NULL 
2536       || (mip->modtype == eModifierType_SourceQual
2537                 && mip->subtype == 255
2538               && StringICmp (mip->name, "note-subsrc") != 0 
2539               && StringICmp (mip->name, "note-orgmod") != 0))
2540   {
2541     mip = ModifierInfoFree (mip);
2542     return title;
2543   }
2544   
2545   if (erase_where_blank && StringHasNoText (value_string))
2546   {
2547     RemoveValueFromDefline (value_name, title);
2548   }
2549   else if (parse_multiple
2550            && value_string [0] == '(' 
2551            && value_string [StringLen (value_string) - 1] == ')'
2552            && (next_semi = StringChr (value_string, ';')) != NULL)
2553   {
2554     val_start = value_string + 1;
2555     title_val_num = 0;
2556     while (next_semi != NULL)
2557     {
2558       /* temporarily truncate at end of value */
2559       val_save_ch = *next_semi;
2560       *next_semi = 0;
2561       
2562       title_loc = FindNthValuePairInDefLine (title, value_name, title_val_num, &title_end);
2563       if (StringHasNoText (val_start))
2564       {
2565         if (title_loc != NULL)
2566         {
2567           RemoveValuePairFromDefline (title_loc, title_end, title);
2568         }
2569         else
2570         {
2571           /* if text is empty and there is no value pair, nothing to do */
2572         }
2573         /* note - we do not increment title_val_num here because either we've
2574          * removed a value or there are no values left.
2575          */
2576       }
2577       else
2578       {
2579         if (title_loc == NULL)
2580         {
2581           /* need to insert a new value - if organism name, put at end of title,
2582            * otherwise insert before second organism name if any
2583            */
2584           if (StringICmp (value_name, "organism") == 0)
2585           {
2586             insert_offset = StringLen (title);
2587           }
2588           else
2589           {
2590             insert_point = FindNthValuePairInDefLine (title, "organism", 1, NULL);
2591             if (insert_point == NULL) 
2592             {
2593               insert_offset = StringLen (title);
2594             }
2595             else
2596             {
2597               insert_offset = insert_point - title;
2598             }
2599           }
2600           title = InsertValuePairAtOffset (title, value_name, val_start, insert_offset);
2601         }
2602         else
2603         {
2604           /* replace values in order */
2605           title = ReplaceValueInThisValuePair (title, title_loc, value_name, 
2606                                                title_end, val_start);
2607         }
2608         
2609         title_val_num++;
2610       }
2611       
2612       /* replace character */
2613       *next_semi = val_save_ch;
2614       /* advance to next value in list */
2615       val_start = next_semi + 1;      
2616       if (*next_semi == ';')
2617       {
2618         next_semi = StringChr (next_semi + 1, ';');
2619         if (next_semi == NULL)
2620         {
2621           next_semi = value_string + StringLen (value_string) - 1;
2622         }
2623       }
2624       else
2625       {
2626         next_semi = NULL;
2627       }
2628     }
2629   }
2630   else if (StringCmp (value_name, "organism") == 0)
2631   {
2632     title = ReplaceValueInOneDefLine (title, value_name, value_string);
2633   }
2634   else
2635   {
2636     title = ReplaceValueInOneDefLineForOrganism (title, value_name, value_string, NULL);
2637   }
2638  
2639   mip = ModifierInfoFree (mip);
2640   return title;
2641 }
2642 
2643 static ValNodePtr ReadRowListFromFile (void)
2644 {
2645   Char          path [PATH_MAX];
2646   ValNodePtr    header_line = NULL;
2647   FILE           *fp;
2648 
2649   path [0] = '\0';
2650   if (! GetInputFileName (path, sizeof (path), NULL, "TEXT")) return NULL;
2651   fp = FileOpen (path, "r");
2652   if (fp == NULL) {
2653     Message (MSG_ERROR, "Unable to open %s", path);
2654   } else {
2655     header_line = ReadTabTableFromFile (fp);
2656     FileClose (fp);
2657   }
2658   return header_line;
2659 }
2660 
2661 /* This function will find the sequence number in the IDAndTitleEdit
2662  * to use for each row and put that value in the sequence_numbers array.
2663  */
2664 static Boolean 
2665 ValidateModifierTableSequenceIDs 
2666 (ValNodePtr        header_line,
2667  IDAndTitleEditPtr iatep,
2668  Int4Ptr           sequence_numbers,
2669  Int4Ptr           num_rows)
2670 {
2671   ValNodePtr   not_found = NULL;
2672   ValNodePtr   found_more_than_once = NULL;
2673   CharPtr      too_many_msg = NULL, not_found_msg = NULL;
2674   Boolean      rval = TRUE;
2675   Int4         msg_len = 0;
2676   CharPtr      too_many_fmt = " found more than once\n";
2677   CharPtr      not_found_fmt = " not found\n";
2678   CharPtr      err_msg = NULL;
2679   ValNodePtr   row_vnp, col_vnp, prev_row, next_row;
2680   Int4         i, seq_num, other_instances;
2681   Boolean      found;
2682   Int4         row_number;
2683   Int4         deleted_rows;
2684   
2685   if (header_line == NULL || header_line->next == NULL || iatep == NULL
2686       || sequence_numbers == NULL || num_rows == NULL || *num_rows < ValNodeLen (header_line->next))
2687   {
2688     return FALSE;
2689   }
2690   
2691   for (row_vnp = header_line->next, row_number = 0; 
2692        row_vnp != NULL && row_number < *num_rows;
2693        row_vnp = row_vnp->next, row_number++)
2694   {
2695     col_vnp = row_vnp->data.ptrvalue;
2696     if (col_vnp == NULL || col_vnp->data.ptrvalue == NULL)
2697     {
2698       continue;
2699     }
2700     
2701     /* find correct sequence number */
2702     seq_num = -1;
2703     for (i = 0, found = FALSE; i < iatep->num_sequences && !found; i++)
2704     {
2705       if (StringCmp (iatep->id_list [i], col_vnp->data.ptrvalue) == 0)
2706       {
2707         seq_num = i;
2708         found = TRUE;
2709       }
2710     }
2711     sequence_numbers[row_number] = seq_num;
2712     
2713     if (!found)
2714     {
2715       ValNodeAddPointer (&not_found, 0, StringSave (col_vnp->data.ptrvalue));
2716     }
2717     else 
2718     {
2719       /* count the number of times this seq_num has already appeared in the list.*/
2720       other_instances = 0;
2721       for (prev_row = header_line->next; prev_row != row_vnp; prev_row = prev_row->next)
2722       {
2723         if (prev_row->choice == seq_num)
2724         {
2725           other_instances++;
2726         }
2727       }
2728       /* if the value was found exactly once, add this to the list of duplicates.
2729        * if the value was found more than once, it will already have been reported.
2730        */
2731       if (other_instances == 1)
2732       {
2733         ValNodeAddPointer (&found_more_than_once, 0, StringSave (col_vnp->data.ptrvalue));
2734       }
2735     }
2736   }
2737   
2738   if (found_more_than_once != NULL || not_found != NULL)
2739   {
2740     if (found_more_than_once != NULL)
2741     {
2742       too_many_msg = CreateListMessage ("Sequence ID", NULL, found_more_than_once);
2743       rval = FALSE;
2744       msg_len += StringLen (too_many_msg) + StringLen (too_many_fmt) + 5;
2745     }
2746     if (not_found != NULL)
2747     {
2748       not_found_msg = CreateListMessage ("Sequence ID", NULL, not_found);
2749       msg_len += StringLen (not_found_msg) + StringLen (not_found_fmt) + 5;
2750     }
2751     
2752     err_msg = (CharPtr) MemNew ((msg_len + 1) * sizeof (Char));
2753     if (err_msg != NULL)
2754     {
2755       if (too_many_msg != NULL)
2756       {
2757         StringCat (err_msg, too_many_msg);
2758         if (found_more_than_once->next != NULL)
2759         {
2760           StringCat (err_msg, " were");
2761         }
2762         else
2763         {
2764           StringCat (err_msg, " was");
2765         }
2766         StringCat (err_msg, too_many_fmt);
2767       }
2768       if (not_found_msg != NULL)
2769       {
2770         StringCat (err_msg, not_found_msg);
2771         if (not_found->next != NULL)
2772         {
2773           StringCat (err_msg, " were");
2774         }
2775         else
2776         {
2777           StringCat (err_msg, " was");
2778         }
2779         StringCat (err_msg, not_found_fmt);
2780       }
2781       if (rval)
2782       {
2783         if (ANS_NO == Message (MSG_YN, "%sContinue anyway?", err_msg))
2784         {
2785           rval = FALSE;
2786         }
2787       }
2788       else
2789       {
2790         Message (MSG_ERROR, "%sPlease correct your file.", err_msg);
2791       }
2792     }
2793     too_many_msg = MemFree (too_many_msg);
2794     not_found_msg = MemFree (not_found_msg);
2795     err_msg = MemFree (err_msg);
2796   }
2797   
2798   /* remove rows for sequence IDs that are not found */
2799   for (row_vnp = header_line->next, row_number = 0, prev_row = header_line; 
2800        row_vnp != NULL && row_number < *num_rows;
2801        row_vnp = next_row, row_number++)
2802   {
2803     next_row = row_vnp->next;
2804     if (sequence_numbers[row_number] < 0) {
2805       prev_row->next = next_row;
2806       row_vnp->next = NULL;
2807       row_vnp = FreeTableDisplayRowList (row_vnp);
2808     } else {
2809       prev_row = row_vnp;
2810     } 
2811   }
2812   
2813   /* now remove sequence_numbers entries */
2814   deleted_rows = 0;
2815   row_number = 0;
2816   while (row_number < *num_rows)
2817   {
2818     if (sequence_numbers[row_number] < 0) {
2819       for (i = row_number + 1; i < *num_rows; i++) {
2820         sequence_numbers[i - 1] = sequence_numbers[i];
2821       }
2822       (*num_rows)--;
2823     } else {
2824       row_number++;
2825     }
2826   }   
2827   
2828   return rval;
2829 }
2830 
2831 /* This checks the column names and puts the modifier type in the choice for each column */
2832 static Boolean ValidateImportModifierColumnNames (ValNodePtr header_line)
2833 {
2834   ValNodePtr      header_vnp;
2835   Boolean         rval = TRUE;
2836   ModifierInfoPtr mip;
2837   CharPtr         orig_name;
2838   Int4            col_num;
2839   
2840   if (header_line == NULL)
2841   {
2842     return FALSE;
2843   }
2844   
2845   header_vnp = header_line->data.ptrvalue;
2846   if (header_vnp == NULL || header_vnp->next == NULL)
2847   {
2848     return FALSE;
2849   }
2850   
2851   /* check ID column */
2852   if (StringICmp (header_vnp->data.ptrvalue, "local_id") != 0
2853       && StringICmp (header_vnp->data.ptrvalue, "local id") != 0
2854       && StringICmp (header_vnp->data.ptrvalue, "local-id") != 0
2855       && StringICmp (header_vnp->data.ptrvalue, "seq_id") != 0
2856       && StringICmp (header_vnp->data.ptrvalue, "seq id") != 0
2857       && StringICmp (header_vnp->data.ptrvalue, "seq-id") != 0
2858       && StringICmp (header_vnp->data.ptrvalue, "seqid") != 0
2859       && StringICmp (header_vnp->data.ptrvalue, "sequence_id") != 0
2860       && StringICmp (header_vnp->data.ptrvalue, "sequence id") != 0
2861       && StringICmp (header_vnp->data.ptrvalue, "sequence-id") != 0
2862       )
2863   {
2864     Message (MSG_ERROR, "Table file is missing header line!  Make sure first column header is seq_id");
2865     return FALSE;      
2866   }
2867   header_vnp = header_vnp->next;
2868   col_num = 1;
2869   while (header_vnp != NULL && rval)
2870   {
2871     mip = MakeModifierInfoFromNameAndValue (header_vnp->data.ptrvalue, NULL);
2872     if (mip == NULL 
2873       || (mip->modtype == eModifierType_SourceQual
2874                 && mip->subtype == 255
2875               && StringICmp (mip->name, "note-subsrc") != 0 
2876               && StringICmp (mip->name, "note-orgmod") != 0))
2877     {
2878       orig_name = (CharPtr) header_vnp->data.ptrvalue;
2879       rval = ReplaceImportModifierName (&orig_name, col_num);
2880       header_vnp->data.ptrvalue = orig_name;
2881     }
2882     else
2883     {
2884       header_vnp->data.ptrvalue = MemFree (header_vnp->data.ptrvalue);
2885       header_vnp->data.ptrvalue = StringSave (mip->name);
2886       header_vnp->choice = mip->modtype;
2887     }
2888     mip = ModifierInfoFree (mip);
2889     header_vnp = header_vnp->next;
2890     col_num++;
2891   }
2892   return rval;
2893 }
2894 
2895 static Boolean StringAlreadyInList (ValNodePtr list, CharPtr str)
2896 {
2897   while (list != NULL)
2898   {
2899     if (StringICmp (list->data.ptrvalue, str) == 0)
2900     {
2901       return TRUE;
2902     }
2903     list = list->next;
2904   }
2905   return FALSE;
2906 }
2907 
2908 static Boolean ValidateTableValues (ValNodePtr header_line)
2909 {
2910   ValNodePtr      header_vnp, row_vnp, col_vnp;
2911   Boolean         rval = TRUE;
2912   ModifierInfoPtr mip;
2913   Int4            col_num;
2914   ValNodePtr      bad_value_columns = NULL;
2915   ValNodePtr      bad_nontext_columns = NULL;
2916   CharPtr         err_msg;
2917   
2918   if (header_line == NULL || header_line->next == NULL 
2919       || header_line->data.ptrvalue == NULL)
2920   {
2921     return FALSE;
2922   }
2923     
2924   for (row_vnp = header_line->next; row_vnp != NULL; row_vnp = row_vnp->next)
2925   {
2926     /* skip rows with bad sequence IDs */
2927     if (row_vnp->data.ptrvalue == NULL)
2928     {
2929       continue;
2930     }
2931     
2932     header_vnp = header_line->data.ptrvalue;
2933     col_vnp = row_vnp->data.ptrvalue;
2934     /* skip ID column */
2935     header_vnp = header_vnp->next;
2936     col_vnp = col_vnp->next;
2937     for (col_num = 1; 
2938          header_vnp != NULL && col_vnp != NULL; 
2939          header_vnp = header_vnp->next, col_vnp = col_vnp->next, col_num++)
2940     {
2941       mip = MakeModifierInfoFromNameAndValue (header_vnp->data.ptrvalue, 
2942                                               col_vnp->data.ptrvalue);
2943       if (mip->modtype == eModifierType_SourceQual
2944                      && IsNonTextModifier (mip->name))
2945       {
2946         if (StringICmp (mip->value, "TRUE") != 0
2947             && StringICmp (mip->value, "FALSE") != 0)
2948         {
2949           if (!StringAlreadyInList (bad_nontext_columns, header_vnp->data.ptrvalue))
2950           {
2951             ValNodeAddPointer (&bad_nontext_columns, col_num, StringSave (header_vnp->data.ptrvalue));
2952           }
2953         }
2954       }
2955       else if (ModifierHasInvalidValue (mip))
2956       {
2957         if (!StringAlreadyInList (bad_value_columns, header_vnp->data.ptrvalue))
2958         {
2959           ValNodeAddPointer (&bad_value_columns, col_num, StringSave (header_vnp->data.ptrvalue));
2960         }
2961       }
2962       mip = ModifierInfoFree (mip);
2963     }
2964   }
2965   
2966   if (bad_value_columns != NULL)
2967   {
2968     err_msg = CreateListMessage ("Your file contains invalid values for column",
2969                                  ". Please edit your file to list valid values.",
2970                                  bad_value_columns);
2971     Message (MSG_ERROR, err_msg);                                 
2972     rval = FALSE;
2973   }
2974   if (bad_nontext_columns != NULL && rval)
2975   {
2976     err_msg = CreateListMessage ("Your file contains values other than TRUE or FALSE for column",
2977                                  ". These modifiers do not allow other text.  Click OK to "
2978                                  "discard this text and mark the values as TRUE.  If you "
2979                                  "wish to preserve this text under another modifier, click "
2980                                  "Cancel and change the column header in your file.",
2981                                  bad_nontext_columns);
2982     if (ANS_CANCEL == Message (MSG_OKC, err_msg))
2983     {
2984       rval = FALSE;
2985     }
2986   }
2987   
2988   bad_value_columns = ValNodeFreeData (bad_value_columns);
2989   bad_nontext_columns = ValNodeFreeData (bad_nontext_columns);
2990   return rval;
2991 }
2992 
2993 static Boolean 
2994 CheckModifiersForOverwrite 
2995 (ValNodePtr        header_line,
2996  IDAndTitleEditPtr iatep,
2997  Int4Ptr           sequence_numbers,
2998  Int4              num_rows,
2999  BoolPtr           erase_where_blank,
3000  BoolPtr           parse_multiple)
3001 {
3002   ValNodePtr row_vnp, header_vnp, col_vnp;
3003   CharPtr    title_val, data_val;
3004   ValNodePtr blank_column_list = NULL;
3005   ValNodePtr replace_column_list = NULL;
3006   ValNodePtr parse_multi_list = NULL;
3007   Int4       col_num, row_num;
3008   Boolean    rval = TRUE;
3009   CharPtr    err_msg;
3010   MsgAnswer  ans;
3011   
3012   if (header_line == NULL || header_line->next == NULL || iatep == NULL
3013       || sequence_numbers == NULL || num_rows < ValNodeLen (header_line->next)
3014       || erase_where_blank == NULL || parse_multiple == NULL)
3015   {
3016     return FALSE;
3017   }
3018   
3019   *erase_where_blank = FALSE;
3020   *parse_multiple = FALSE;
3021   
3022   for (row_vnp = header_line->next, row_num = 0;
3023        row_vnp != NULL && row_num < num_rows;
3024        row_vnp = row_vnp->next, row_num++)
3025   {
3026     if (row_vnp->data.ptrvalue == NULL)
3027     {
3028       continue;
3029     }
3030     header_vnp = header_line->data.ptrvalue;
3031     col_vnp = row_vnp->data.ptrvalue;
3032     
3033     /* skip ID column */
3034     header_vnp = header_vnp->next;
3035     col_vnp = col_vnp->next;
3036     
3037     col_num = 1;
3038     while (header_vnp != NULL && col_vnp != NULL)
3039     {
3040       /* if column name is blank, skip */
3041       if (header_vnp->data.ptrvalue != NULL)
3042       {
3043         title_val = FindValueFromPairInDefline (header_vnp->data.ptrvalue,
3044                                                 iatep->title_list [sequence_numbers[row_num]]);
3045         data_val = col_vnp->data.ptrvalue;
3046         if (!StringHasNoText (title_val))
3047         {
3048           if (StringHasNoText (data_val))
3049           {
3050             /* add to list of possible erasures */
3051             if (!StringAlreadyInList (blank_column_list, header_vnp->data.ptrvalue))
3052             {
3053               ValNodeAddPointer (&blank_column_list, col_num, StringSave (header_vnp->data.ptrvalue));
3054             }
3055           }
3056           else if (StringCmp (data_val, title_val) != 0)
3057           {
3058             /* add to list of possible replacements */
3059             if (!StringAlreadyInList (replace_column_list, header_vnp->data.ptrvalue))
3060             {
3061               ValNodeAddPointer (&replace_column_list, col_num, StringSave (header_vnp->data.ptrvalue));
3062             }
3063           }
3064         }
3065         title_val = MemFree (title_val);
3066         /* check for multival parsing */
3067         if (data_val != NULL 
3068             && data_val [0] == '(' && data_val [StringLen (data_val) - 1] == ')'
3069             && StringChr (data_val, ';') != NULL
3070             && !StringAlreadyInList (parse_multi_list, header_vnp->data.ptrvalue))
3071         {
3072           ValNodeAddPointer (&parse_multi_list, col_num, StringSave (header_vnp->data.ptrvalue));
3073         }
3074       }
3075       header_vnp = header_vnp->next;
3076       col_vnp = col_vnp->next;
3077       col_num++;
3078     }
3079   }
3080     
3081   if (replace_column_list != NULL)
3082   {
3083     err_msg = CreateListMessage ("Record already contains values for column",
3084                                  " also found in the import table.\n"
3085                                  "Do you wish to overwrite these values?",
3086                                  replace_column_list);
3087     if (ANS_NO == Message (MSG_YN, err_msg))
3088     {
3089       rval = FALSE;
3090     }
3091     err_msg = MemFree (err_msg);
3092   }
3093   
3094   if (blank_column_list != NULL && rval)
3095   {
3096     err_msg = CreateListMessage ("Your import table contains blanks in column",
3097                                  " where data already exists in the sequences.\n"
3098                                  "Do you wish to erase these values in the sequences?\n"
3099                                  "If you say no, the old values will remain.",
3100                                  blank_column_list);
3101     ans = Message (MSG_YNC, err_msg);
3102     err_msg = MemFree (err_msg);
3103     if (ans == ANS_CANCEL)
3104     {
3105       rval = FALSE;
3106     }
3107     else if (ans == ANS_YES)
3108     {
3109       *erase_where_blank = TRUE;
3110     }
3111   }
3112 
3113 #if 0  
3114   /* ability to parse multiple entry format removed (for now) */
3115   if (parse_multi_list != NULL && rval)
3116   {
3117     err_msg = CreateListMessage ("Your import table contains values in column",
3118                                  " where the values are in form '(value1;value2)'.\n"
3119                                  "Do you wish to parse these values into multiple modifiers?\n"
3120                                  "If you say no, the values will be applied to a single modifier.",
3121                                  parse_multi_list);
3122     ans = Message (MSG_YNC, err_msg);
3123     err_msg = MemFree (err_msg);
3124     if (ans == ANS_CANCEL)
3125     {
3126       rval = FALSE;
3127     }
3128     else if (ans == ANS_YES)
3129     {
3130       *parse_multiple = TRUE;
3131     }
3132   }
3133 #endif  
3134   
3135   blank_column_list = ValNodeFree (blank_column_list);  
3136   replace_column_list = ValNodeFreeData (replace_column_list);
3137   parse_multi_list = ValNodeFreeData (parse_multi_list);
3138   
3139   return rval;
3140 }
3141 
3142 static Boolean ImportModifiersToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3143 {
3144   ValNodePtr   header_line, row_vnp, col_vnp, header_vnp;
3145   Boolean      erase_where_blank = FALSE, parse_multi = FALSE;
3146   Int4Ptr      sequence_numbers;
3147   Int4         num_rows, row_number;
3148   
3149   if (iatep == NULL)
3150   {
3151     return FALSE;
3152   }
3153 
3154   SendHelpScrollMessage (helpForm, "Organism Page", "Import Source Modifiers");  
3155   
3156   header_line = ReadRowListFromFile ();
3157   if (header_line == NULL || header_line->next == NULL)
3158   {
3159     header_line = FreeTableDisplayRowList (header_line);
3160     return FALSE;
3161   }
3162   
3163   header_vnp = header_line->data.ptrvalue;
3164   if (header_vnp == NULL || header_vnp->next == NULL)
3165   {
3166     header_line = FreeTableDisplayRowList (header_line);
3167     return FALSE;
3168   }
3169   
3170   num_rows = ValNodeLen (header_line->next);
3171   sequence_numbers = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
3172   
3173   if (!ValidateModifierTableSequenceIDs (header_line, iatep, sequence_numbers, &num_rows))
3174   {
3175     header_line = FreeTableDisplayRowList (header_line);
3176     sequence_numbers = MemFree (sequence_numbers);
3177     return FALSE;
3178   }
3179   
3180   /* first, validate all column names and values */
3181   if (!ValidateImportModifierColumnNames (header_line))
3182   {
3183     header_line = FreeTableDisplayRowList (header_line);
3184     sequence_numbers = MemFree (sequence_numbers);
3185     return FALSE;
3186   }
3187   
3188   if (!ValidateTableValues (header_line))
3189   {
3190     header_line = FreeTableDisplayRowList (header_line);
3191     sequence_numbers = MemFree (sequence_numbers);
3192     return FALSE;
3193   }
3194   
3195   if (!CheckModifiersForOverwrite (header_line, iatep, 
3196                                    sequence_numbers, num_rows, 
3197                                    &erase_where_blank, &parse_multi))
3198   {
3199     header_line = FreeTableDisplayRowList (header_line);
3200     sequence_numbers = MemFree (sequence_numbers);
3201     return FALSE;
3202   }
3203   
3204   /* now apply */
3205   for (row_vnp = header_line->next, row_number = 0;
3206        row_vnp != NULL && row_number < num_rows; 
3207        row_vnp = row_vnp->next, row_number++)
3208   {
3209     if (row_vnp->data.ptrvalue == NULL)
3210     {
3211       continue;
3212     }
3213     header_vnp = header_line->data.ptrvalue;
3214     col_vnp = row_vnp->data.ptrvalue;
3215     
3216     /* skip the ID column */
3217     header_vnp = header_vnp->next;
3218     col_vnp = col_vnp->next;
3219     
3220     for (;
3221          header_vnp != NULL && col_vnp != NULL;
3222          header_vnp = header_vnp->next, col_vnp = col_vnp->next)
3223     {
3224       iatep->title_list [sequence_numbers [row_number]] = ApplyImportModToTitle (iatep->title_list [sequence_numbers[row_number]],
3225                                                                    header_vnp->data.ptrvalue,
3226                                                                    col_vnp->data.ptrvalue,
3227                                                                    erase_where_blank,
3228                                                                    parse_multi);
3229     }
3230   }
3231   sequence_numbers = MemFree (sequence_numbers);  
3232   return TRUE;
3233 }
3234 
/* State kept for one FASTA import page.  FormatFastaDoc reads is_na and
 * is_mrna to choose between protein, nucleotide and transcript wording
 * when it summarizes the imported sequences.
 */
typedef struct fastapage {
  DIALOG_MESSAGE_BLOCK
  Char         path [PATH_MAX];     /* presumably the path of the imported file - confirm against callers */
  SeqEntryPtr  list;                /* chain of imported entries; each is freed by ResetFastaPage */
  ValNodePtr   errmsgs;             /* messages walked in step with list by FormatFastaDoc */
  DoC          doc;                 /* document that FormatFastaDoc appends the summary text to */
  GrouP        instructions;
  GrouP        have_seq_instr_grp;
  GrouP        singleIdGrp;
  TexT         singleSeqID;  
  Boolean      is_na;               /* selects nucleotide rather than protein reporting */
  Boolean      is_mrna;             /* selects transcript (mRNA) reporting */
  Boolean      is_delta;
  Boolean      parseSeqId;
  Boolean      single;              /* when set, more than one entry in list is reported as an error */
  Int2Ptr      seqPackagePtr;
  ButtoN       import_btn;
  ButtoN       clear_btn;
} FastaPage, PNTR FastaPagePtr;
3254 
/* paragraph and column formats passed to AppendText by FormatFastaDoc */
static ParData faParFmt = {FALSE, FALSE, FALSE, FALSE, FALSE, 0, 0};
static ColData faColFmt = {0, 0, 80, 0, NULL, 'l', TRUE, FALSE, FALSE, FALSE, TRUE};
3257 
3258 static void ResetFastaPage (FastaPagePtr fpp)
3259 
3260 {
3261   SeqEntryPtr  next;
3262   SeqEntryPtr  sep;
3263 
3264   if (fpp != NULL) {
3265     sep = fpp->list;
3266     while (sep != NULL) {
3267       next = sep->next;
3268       sep->next = NULL;
3269       SeqEntryFree (sep);
3270       sep = next;
3271     }
3272     fpp->list = NULL;
3273     fpp->errmsgs = ValNodeFreeData (fpp->errmsgs);
3274   }
3275 }
3276 
3277 static CharPtr GetModValueFromSeqEntry (SeqEntryPtr sep, CharPtr mod_name)
3278 {
3279   CharPtr ttl = NULL;
3280   CharPtr value = NULL;
3281   
3282   if (sep == NULL || StringHasNoText (mod_name))
3283   {
3284     return NULL;
3285   }
3286 
3287   SeqEntryExplore (sep, (Pointer) (&ttl), FindFirstTitle);
3288   if (StringHasNoText (ttl))
3289   {
3290     return NULL;
3291   }
3292   
3293   value =  FindValueFromPairInDefline (mod_name, ttl);
3294   
3295   return value;  
3296 }
3297 
3298 static void AddReportLine (CharPtr str, CharPtr name, CharPtr tmp)
3299 
3300 {
3301   StringCat (str, name);
3302   StringCat (str, ": ");
3303   StringCat (str, tmp);
3304   StringCat (str, "\n");
3305 }
3306 
/* forward declarations for helpers defined elsewhere in this file */
static CharPtr GetDisplayValue (CharPtr mod_name, CharPtr title, BoolPtr multi_found);
static CharPtr GetDisplayValueFromModifierInfoList (CharPtr mod_name, ValNodePtr modifier_info_list, BoolPtr multi_found);
3309 
3310 static void ReportModifiers (CharPtr str, CharPtr report_name,
3311                              ValNodePtr modifier_info_list, CharPtr mod_name, CharPtr not_found_msg)
3312 {
3313   CharPtr valstr;
3314   Boolean multi_found = TRUE;
3315 
3316   valstr = GetDisplayValueFromModifierInfoList (mod_name, modifier_info_list, &multi_found);
3317   if (IsNonTextModifier (mod_name) && StringICmp (valstr, "FALSE") == 0)
3318   {
3319         valstr = MemFree (valstr);
3320   }
3321 
3322   if (!StringHasNoText (valstr)) {
3323     AddReportLine (str, report_name, valstr);
3324   } else if (!StringHasNoText (not_found_msg)) {
3325     StringCat (str, not_found_msg);
3326   }
3327   valstr = MemFree (valstr);
3328 }
3329 
3330 static void LookupAndAddReportLine (CharPtr str, CharPtr report_name, 
3331                                     CharPtr title, CharPtr mod_name, CharPtr not_found_msg)
3332 {
3333   CharPtr valstr;
3334   Boolean multi_found = TRUE;
3335   
3336   valstr = GetDisplayValue (mod_name, title, &multi_found);
3337   if (IsNonTextModifier (mod_name) && StringICmp (valstr, "FALSE") == 0)
3338   {
3339         valstr = MemFree (valstr);
3340   }
3341 
3342   if (!StringHasNoText (valstr)) {
3343     AddReportLine (str, report_name, valstr);
3344   } else if (!StringHasNoText (not_found_msg)) {
3345     StringCat (str, not_found_msg);
3346   }
3347   valstr = MemFree (valstr);
3348 }
3349 
3350 static void LookupAndAddLocationReportLine (CharPtr str, CharPtr title)
3351 {
3352   CharPtr valstr;
3353   
3354   valstr = FindValueFromPairInDefline ("location", title);
3355   if (!StringHasNoText (valstr) && StringICmp (valstr, "genomic") != 0) {
3356     AddReportLine (str, "Location", valstr);
3357   }
3358   valstr = MemFree (valstr);
3359 }
3360 
3361 static CharPtr singlewarn = "\
3362 ERROR - You may not enter multiple segments for a single sequence submission.\
3363 You should either clear the nucleotide and import a single FASTA record, or\
3364 return to the Sequence Format form and choose the proper submission type.\n\n";
3365 
3366 #define FastaFormatBufLen 2000
3367 
3368 static Int4 CountSegSetSegments (SeqEntryPtr sep)
3369 {
3370   BioseqSetPtr bssp;
3371   
3372   if (sep == NULL || sep->data.ptrvalue == NULL || ! IS_Bioseq_set (sep))
3373   {
3374     return 0;
3375   }
3376   
3377   bssp = (BioseqSetPtr) sep->data.ptrvalue;
3378   if (bssp->_class != BioseqseqSet_class_segset)
3379   {
3380     return 0;
3381   }
3382   sep = bssp->seq_set;
3383   
3384   while (sep != NULL)
3385   {
3386     if (IS_Bioseq_set (sep) && sep->data.ptrvalue != NULL)
3387     {
3388       bssp = (BioseqSetPtr) sep->data.ptrvalue;
3389       if (bssp->_class == BioseqseqSet_class_parts)
3390       {
3391         return ValNodeLen (bssp->seq_set);
3392       }
3393     }
3394     sep = sep->next;
3395   }
3396   return 0;
3397 }
3398 
3399 static void FormatFastaDoc (FastaPagePtr fpp)
3400 
3401 {
3402   Nlm_QualNameAssocPtr ap;
3403   BioseqPtr          bsp;
3404   Boolean            hasErrors;
3405   CharPtr            label;
3406   Int4               len;
3407   CharPtr            measure;
3408   SeqEntryPtr        nsep = NULL;
3409   Int2               num;
3410   CharPtr            plural;
3411   CharPtr            ptr;
3412   SeqIdPtr           sip;
3413   SeqEntryPtr        sep;
3414   CharPtr            str;
3415   CharPtr            title;
3416   CharPtr            ttl;
3417   CharPtr            tmp;
3418   ValNodePtr         vnp;
3419   Int4               num_seg;
3420   CharPtr            valstr;
3421   ValNodePtr         modifier_info_list = NULL;
3422 
3423   if (fpp != NULL) {
3424     str = MemNew (sizeof (char) * FastaFormatBufLen);
3425     tmp = MemNew (sizeof (char) * FastaFormatBufLen);
3426     if (str == NULL || tmp == NULL) return;
3427     num = 0;
3428     len = 0;
3429     hasErrors = FALSE;
3430     for (sep = fpp->list; sep != NULL; sep = sep->next) {
3431       num++;
3432       if (IS_Bioseq (sep)) {
3433         bsp = (BioseqPtr) sep->data.ptrvalue;
3434         if (bsp != NULL) {
3435           len += bsp->length;
3436         }
3437       } else if (IS_Bioseq_set (sep)) {
3438         nsep = FindNucSeqEntry (sep);
3439         if (nsep != NULL && IS_Bioseq (nsep)) {
3440           bsp = (BioseqPtr) nsep->data.ptrvalue;
3441           if (bsp != NULL) {
3442             len += bsp->length;
3443           }
3444         }
3445       }
3446     }
3447     if (num > 1) {
3448       plural = "s";
3449     } else {
3450       plural = "";
3451     }
3452     if (fpp->single && num > 1) {
3453       AppendText (fpp->doc, singlewarn, &faParFmt, &faColFmt, programFont);
3454       hasErrors = TRUE;
3455     }
3456     if (fpp->is_mrna) {
3457       label = "Message";
3458       measure = "nucleotides";
3459     } else if (fpp->is_na) {
3460       label = "Sequence";
3461       measure = "bases";
3462     } else {
3463       label = "Sequence";
3464       measure = "amino acids";
3465     }
3466     if (fpp->is_mrna) {
3467       sprintf (str, "%d transcript sequence%s, total length %ld %s\n",
3468                (int) num, plural, (long) len, measure);
3469     } else if (fpp->is_na) {
3470       sprintf (str, "%d nucleotide sequence%s, total length %ld %s\n",
3471                (int) num, plural, (long) len, measure);
3472     } else {
3473       sprintf (str, "%d protein sequence%s, total length %ld %s\n",
3474                (int) num, plural, (long) len, measure);
3475     }
3476     AppendText (fpp->doc, str, &faParFmt, &faColFmt, programFont);
3477     vnp = fpp->errmsgs;
3478     num = 0;
3479     for (sep = fpp->list; sep != NULL; sep = sep->next) {
3480       num++;
3481       len = 0;
3482       num_seg = CountSegSetSegments (sep);
3483       sip = NULL;
3484       tmp [0] = '\0';
3485       if (IS_Bioseq (sep)) {
3486         bsp = (BioseqPtr) sep->data.ptrvalue;
3487         if (bsp != NULL) {
3488           len = bsp->length;
3489           sip = SeqIdFindWorst (bsp->id);
3490           SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
3491         }
3492         nsep = sep;
3493       } else if (IS_Bioseq_set (sep)) {
3494         nsep = FindNucSeqEntry (sep);
3495         if (nsep != NULL && IS_Bioseq (nsep)) {
3496           bsp = (BioseqPtr) nsep->data.ptrvalue;
3497           if (bsp != NULL) {
3498             len = bsp->length;
3499             sip = SeqIdFindWorst (bsp->id);
3500             SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
3501           }
3502         }
3503       }
3504       
3505       /* if segmented set, show number of segments */
3506       if (num_seg > 0)
3507       {
3508         sprintf (str, "\nSegset %d Sequence ID: %s\nLength: %ld %s (%d segments)\n",
3509                  (int) num, tmp, (long) len, measure, num_seg);
3510       }
3511       else
3512       {
3513         sprintf (str, "\n%s %d Sequence ID: %s\nLength: %ld %s\n", label,
3514                  (int) num, tmp, (long) len, measure);
3515       }
3516       ttl = NULL;
3517       SeqEntryExplore (nsep, (Pointer) (&ttl), FindFirstTitle);
3518       title = StringSaveNoNull (ttl);
3519       modifier_info_list = ParseAllBracketedModifiers (title);
3520       if (title != NULL && (! fpp->is_na)) {
3521 
3522         ReportModifiers (str, "Gene", modifier_info_list, "gene", "No gene name detected\n");
3523         ReportModifiers (str, "Protein", modifier_info_list, "protein", "No protein name detected\n");
3524         ReportModifiers (str, "Gene Syn", modifier_info_list, "gene_syn", NULL);
3525         ReportModifiers (str, "Protein Desc", modifier_info_list, "protein_desc", NULL);
3526 
3527         ptr = StringISearch (title, "[orf]");
3528         if (ptr != NULL) {
3529         StringCat (str, "ORF indicated\n");
3530         }
3531         ReportModifiers (str, "Protein Comment", modifier_info_list, "comment", NULL);
3532       }
3533 
3534       if (title != NULL && fpp->is_na && (! fpp->is_mrna)) {
3535         ReportModifiers (str, "Organism", modifier_info_list, "organism", NULL);
3536         ReportModifiers (str, "Lineage", modifier_info_list, "lineage", NULL);
3537         for (ap = current_orgmod_subtype_alist; ap->name != NULL; ap++) {
3538           ReportModifiers (str, ap->name, modifier_info_list, ap->name, NULL); 
3539         }
3540         for (ap = current_subsource_subtype_alist; ap->name != NULL; ap++) {
3541           ReportModifiers (str, ap->name, modifier_info_list, ap->name, NULL); 
3542         }
3543         LookupAndAddReportLine (str, "Note", title, "note", NULL); 
3544         LookupAndAddReportLine (str, "Note", title, "subsource", NULL); 
3545         LookupAndAddReportLine (str, "Molecule", title, "molecule", NULL); 
3546         LookupAndAddReportLine (str, "MolType", title, "moltype", NULL); 
3547         LookupAndAddLocationReportLine (str, title); 
3548         LookupAndAddReportLine (str, "Genetic Code", title, "genetic_code", NULL);
3549       }
3550 
3551       if (title != NULL && fpp->is_na && fpp->is_mrna) {
3552         LookupAndAddReportLine (str, "Gene", title, "gene", "No gene name detected\n"); 
3553         valstr = FindValueFromPairInDefline ("mrna", title);
3554         if (!StringHasNoText (valstr)) {
3555           AddReportLine (str, "mRNA", valstr);
3556           valstr = MemFree (valstr);
3557         } else {
3558           valstr = MemFree (valstr);
3559           valstr = FindValueFromPairInDefline ("cdna", title);
3560           if (!StringHasNoText (valstr)) {
3561             AddReportLine (str, "cDNA", valstr);
3562           } else {
3563             StringCat (str, "No mRNA name detected\n");
3564           }
3565           valstr = MemFree (valstr);
3566         }
3567         LookupAndAddReportLine (str, "Comment", title, "comment", NULL); 
3568       }
3569       MemFree (title);
3570       ttl = NULL;
3571       SeqEntryExplore (nsep, (Pointer) (&ttl), FindFirstTitle);
3572       title = StringSaveNoNull (ttl);
3573       if (title != NULL) {
3574         RemoveRecognizedModifiersFromTitle (title, modifier_info_list, fpp->is_na);
3575         if (fpp->is_mrna) {
3576           StripAllInstancesOfModNameFromTitle ("gene", title);
3577           StripAllInstancesOfModNameFromTitle ("mrna", title);
3578           StripAllInstancesOfModNameFromTitle ("cdna", title);
3579           StripAllInstancesOfModNameFromTitle ("comment", title);
3580         } 
3581         TrimSpacesAroundString (title);
3582         if (! StringHasNoText (title)) {
3583           StringCat (str, "Title: ");
3584           StringNCat (str, title, 128);
3585           StringCat (str, "\n");
3586         } else {
3587           StringCat (str, "No title detected\n");
3588         }
3589       }
3590       MemFree (title);
3591       ModifierInfoListFree (modifier_info_list);
3592       if (vnp != NULL && vnp->data.ptrvalue != NULL) {
3593         hasErrors = TRUE;
3594         StringCat (str, (CharPtr) vnp->data.ptrvalue);
3595         StringCat (str, "\n");
3596       }
3597       AppendText (fpp->doc, str, &faParFmt, &faColFmt, programFont);
3598       if (vnp != NULL) {
3599         vnp = vnp->next;
3600       }
3601     }
3602     MemFree (str);
3603     MemFree (tmp);
3604     UpdateDocument (fpp->doc, 0, 0);
3605     if (hasErrors) {
3606       Beep ();
3607       Beep ();
3608       Beep ();
3609     }
3610   }
3611 }
3612 
3613 extern SeqEntryPtr ImportOneGappedSequence (FILE *fp)
3614 {
3615   BioseqPtr      bsp;
3616   Pointer        dataptr;
3617   Uint2          datatype;
3618   SeqEntryPtr    topsep;
3619   SeqSubmitPtr   ssp;
3620   ErrSev         oldsev;
3621   
3622   if (fp == NULL) return NULL;
3623   
3624   oldsev = ErrSetMessageLevel (SEV_MAX);
3625   bsp = ReadDeltaFasta (fp, NULL);
3626   ErrSetMessageLevel (oldsev);
3627   if (bsp == NULL)
3628   {
3629     topsep = NULL;
3630     dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE,
3631                                     TRUE, FALSE);
3632     if (dataptr != NULL)
3633     {
3634       /* Get a pointer to the new SeqEntry */
3635       if (datatype == OBJ_SEQENTRY)
3636       {
3637         topsep = (SeqEntryPtr) dataptr;
3638       }
3639       else if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET)
3640       {
3641         topsep = SeqMgrGetSeqEntryForData (dataptr);
3642       }
3643       else if (datatype == OBJ_SEQSUB) 
3644       {
3645         ssp = (SeqSubmitPtr) dataptr;
3646         if (ssp != NULL && ssp->datatype == 1)
3647         {
3648           topsep = (SeqEntryPtr) ssp->data;
3649         }
3650       }
3651     }
3652   }
3653   else
3654   {
3655     topsep = SeqMgrGetSeqEntryForData (bsp);
3656   }
3657 
3658   return topsep;
3659 }
3660 
3661 static SeqEntryPtr SegsetFromSeqEntryList (SeqEntryPtr list)
3662 {
3663   SeqEntryPtr  first_sep, tmp_sep, next_sep;
3664   BioseqPtr    bsp;
3665   SeqDescrPtr  sdp = NULL, set_sdp;
3666   
3667   if (list == NULL)
3668   {
3669     return NULL;
3670   }
3671   
3672   first_sep = list;
3673   next_sep = first_sep->next;
3674   first_sep->next = NULL;
3675   
3676   /* grab title on first sequence to put on segmented bioseq */
3677   if (IS_Bioseq (first_sep) && first_sep->data.ptrvalue != NULL)
3678   {
3679     bsp = (BioseqPtr) first_sep->data.ptrvalue;
3680     sdp = bsp->descr;
3681     while (sdp != NULL && sdp->choice != Seq_descr_title)
3682     {
3683       sdp = sdp->next;
3684     }
3685   }
3686 
3687   while (next_sep != NULL)
3688   {
3689     tmp_sep = next_sep;
3690     next_sep = tmp_sep->next;
3691     tmp_sep->next = NULL;
3692     AddSeqEntryToSeqEntry (first_sep, tmp_sep, TRUE);
3693   }
3694   
3695   if (sdp != NULL && IS_Bioseq_set (first_sep))
3696   {
3697     tmp_sep = FindNucSeqEntry (first_sep);
3698     if (tmp_sep != NULL && IS_Bioseq (tmp_sep) && tmp_sep->data.ptrvalue != NULL)
3699     {
3700       bsp = tmp_sep->data.ptrvalue;
3701       set_sdp = bsp->descr;
3702       while (set_sdp != NULL && set_sdp->choice != Seq_descr_title)
3703       {
3704         set_sdp = set_sdp->next;
3705       }
3706       if (set_sdp == NULL)
3707       {
3708         set_sdp = CreateNewDescriptor (tmp_sep, Seq_descr_title);
3709       }
3710       if (set_sdp != NULL && StringHasNoText (set_sdp->data.ptrvalue))
3711       {
3712         /* make a copy, rather than removing the segment title */
3713         set_sdp->data.ptrvalue = MemFree (set_sdp->data.ptrvalue);
3714         set_sdp->data.ptrvalue = StringSave (sdp->data.ptrvalue);
3715       }
3716     }
3717   }  
3718   
3719   return first_sep;
3720 }
3721 
/* forward declaration; called by ReadOneSegSet when IDs are parsed from titles */
static void ReplaceFakeIDWithIDFromTitle (BioseqPtr bsp);
3723 
3724 static SeqEntryPtr 
3725 ReadOneSegSet 
3726 (FILE            *fp,
3727  Boolean         parse_id,
3728  ValNodePtr PNTR err_msg_list,
3729  BoolPtr         chars_stripped)
3730 {
3731   SeqEntryPtr nextsep;
3732   CharPtr     errormsg = NULL;
3733   Char        lastchar;
3734   SeqEntryPtr seg_list = NULL, seg_list_last = NULL;
3735   BioseqPtr   bsp;
3736   
3737   if (fp == NULL)
3738   {
3739     return NULL;
3740   }
3741   
3742   /* note - we pass in FALSE for parse_id in SequinFastaToSeqEntryEx
3743    * because we do not want to use Sequin's auto-generated sequence IDs.
3744    * We then parse the sequence ID from the title ourselves using
3745    * ReplaceFakeIDWithIDFromTitle if parse_id is TRUE, or leave the ID
3746    * as blank to force the user to select a real ID later.
3747    */
3748   nextsep = SequinFastaToSeqEntryExEx (fp, TRUE, &errormsg, FALSE, &lastchar, chars_stripped);
3749   while (nextsep != NULL ||
3750          (lastchar != (Char) EOF && lastchar != NULLB && lastchar != (Char) 255
3751           && lastchar != ']')) 
3752   {
3753     if (nextsep != NULL) 
3754     {
3755       /* replace fake ID with ID from title */
3756       if (IS_Bioseq (nextsep) && nextsep->data.ptrvalue != NULL)
3757       {
3758         bsp = (BioseqPtr) nextsep->data.ptrvalue;
3759         if (parse_id)
3760         {
3761           ReplaceFakeIDWithIDFromTitle ((BioseqPtr) nextsep->data.ptrvalue);
3762         }
3763         else
3764         {
3765           bsp->id = SeqIdFree (bsp->id);
3766         }
3767       }
3768       SeqEntryPack (nextsep); 
3769       if (seg_list_last == NULL)
3770       {
3771         seg_list = nextsep;
3772       }
3773       else
3774       {
3775         seg_list_last->next = nextsep;
3776       }
3777       seg_list_last = nextsep;
3778       
3779       ValNodeAddPointer (err_msg_list, 0, errormsg);
3780       errormsg = NULL;
3781     }
3782     nextsep = SequinFastaToSeqEntryExEx (fp, TRUE, &errormsg, FALSE, &lastchar, chars_stripped);
3783   }
3784   nextsep = SegsetFromSeqEntryList (seg_list);
3785   return nextsep;
3786 }
3787 
3788 static void AddDefaultMoleculeTypeToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3789 {
3790   Int4    seq_num;
3791   CharPtr old_value;
3792   
3793   if (iatep == NULL)
3794   {
3795     return;
3796   }
3797   
3798   for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3799   {
3800     if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3801     {
3802       continue;
3803     }
3804     old_value = FindValueFromPairInDefline("moltype", 
3805                                            iatep->title_list [seq_num]);
3806     if (StringHasNoText (old_value) || StringICmp (old_value, "dna") == 0)
3807     {
3808       iatep->title_list [seq_num] = ReplaceValueInOneDefLine(iatep->title_list [seq_num],
3809                                                              "moltype",
3810                                                              "Genomic DNA");
3811     }
3812     old_value = MemFree (old_value);
3813   }
3814 }
3815 
3816 static void AddDefaultLocationToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3817 {
3818   Int4    seq_num;
3819   CharPtr old_value, first_organism, next_org_loc = NULL, org_stop;
3820   
3821   if (iatep == NULL)
3822   {
3823     return;
3824   }
3825   
3826   for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3827   {
3828     if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3829     {
3830       continue;
3831     }
3832     first_organism = FindValuePairInDefLine ("organism", iatep->title_list [seq_num], &org_stop);
3833     if (first_organism != NULL)
3834     {
3835       next_org_loc = FindValuePairInDefLine ("organism", org_stop + 1, NULL);
3836     }
3837     else
3838     {
3839       next_org_loc = NULL;
3840     }
3841     old_value = FindValueFromPairInDeflineBeforeCharPtr ("location", 
3842                                                          iatep->title_list [seq_num],
3843                                                          next_org_loc);
3844     if (StringHasNoText (old_value))
3845     {
3846       iatep->title_list [seq_num] = ReplaceValueInOneDefLineForOrganism (iatep->title_list [seq_num],
3847                                                                          "location",
3848                                                                          "genomic",
3849                                                                          first_organism);
3850     }
3851     old_value = MemFree (old_value);
3852   }
3853 }
3854 
3855 static void AddDefaultTopologyToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3856 {
3857   Int4    seq_num;
3858   CharPtr old_value;
3859   
3860   if (iatep == NULL)
3861   {
3862     return;
3863   }
3864   
3865   for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3866   {
3867     if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3868     {
3869       continue;
3870     }
3871     old_value = FindValueFromPairInDefline ("topology", 
3872                                             iatep->title_list [seq_num]);
3873     if (StringHasNoText (old_value))
3874     {
3875       iatep->title_list [seq_num] = ReplaceValueInOneDefLine(iatep->title_list [seq_num],
3876                                                              "topology",
3877                                                              "Linear");
3878     }
3879     old_value = MemFree (old_value);
3880   }
3881 }
3882 
3883 static void AddDefaultGeneticCodesToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3884 {
3885   CharPtr     taxname, location, gcode_name;
3886   Int4        gcode;
3887   ValNodePtr  gencodelist;
3888   Int4        seq_num;
3889   CharPtr     first_organism, next_org_loc = NULL, org_stop;
3890 
3891   if (iatep == NULL)
3892   {
3893     return;
3894   }
3895   
3896   gencodelist = GetGeneticCodeValNodeList ();
3897   for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3898   {
3899     if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3900     {
3901       continue;
3902     }
3903     first_organism = FindValuePairInDefLine ("organism", iatep->title_list [seq_num], &org_stop);
3904     if (first_organism != NULL)
3905     {
3906       next_org_loc = FindValuePairInDefLine ("organism", org_stop + 1, NULL);
3907     }
3908     else
3909     {
3910       next_org_loc = NULL;
3911     }
3912     
3913     taxname = FindValueFromPairInDefline ("organism", first_organism);
3914     location = FindValueFromPairInDeflineBeforeCharPtr ("location", 
3915                                                         iatep->title_list [seq_num],
3916                                                         next_org_loc);
3917     if (StringHasNoText (location))
3918     {
3919       location = StringSave ("genomic");
3920     }
3921     
3922     gcode = GetGeneticCodeForTaxNameAndLocation (taxname, location);
3923     taxname = MemFree (taxname);
3924     location = MemFree (location);
3925     
3926     if (gcode < 0)
3927     {
3928       gcode_name = FindValueFromPairInDeflineBeforeCharPtr ("genetic_code",
3929                                                             iatep->title_list [seq_num],
3930                                                             next_org_loc);
3931       if (StringHasNoText (gcode_name))
3932       {
3933         gcode_name = MemFree (gcode_name);
3934         gcode_name = GeneticCodeStringFromIntAndList (1, gencodelist);
3935         iatep->title_list [seq_num] = ReplaceValueInOneDefLineForOrganism (iatep->title_list [seq_num],
3936                                                                          "genetic_code",
3937                                                                          gcode_name,
3938                                                                          first_organism);
3939       }
3940       else
3941       {
3942         gcode_name = MemFree (gcode_name);
3943       }
3944     }
3945     else
3946     {
3947       gcode_name = GeneticCodeStringFromIntAndList (gcode, gencodelist);
3948       iatep->title_list [seq_num] = ReplaceValueInOneDefLineForOrganism (iatep->title_list [seq_num],
3949                                                                          "genetic_code",
3950                                                                          gcode_name,
3951                                                                          first_organism);
3952     }
3953   }
3954   ValNodeFreeData (gencodelist);
3955 }
3956 
3957 static void AddDefaultModifierValues (SeqEntryPtr seq_list)
3958 {
3959   IDAndTitleEditPtr iatep;
3960   
3961   iatep = SeqEntryListToIDAndTitleEdit (seq_list);
3962   AddDefaultMoleculeTypeToIDAndTitleEdit (iatep);
3963   AddDefaultLocationToIDAndTitleEdit (iatep);
3964   AddDefaultTopologyToIDAndTitleEdit (iatep);
3965   AddDefaultGeneticCodesToIDAndTitleEdit (iatep);
3966   ApplyIDAndTitleEditToSeqEntryList (seq_list, iatep);
3967   iatep = IDAndTitleEditFree (iatep);
3968 }
3969 
3970 static Boolean HasGapID (SeqEntryPtr sep)
3971 {
3972   BioseqPtr bsp;
3973   Char      id_str [128];
3974   Int4      j;
3975   
3976   if (sep == NULL || ! IS_Bioseq (sep) || (bsp = sep->data.ptrvalue) == NULL)
3977   {
3978     return FALSE;
3979   }
3980   
3981   SeqIdWrite (bsp->id, id_str, PRINTID_REPORT, sizeof (id_str));
3982   
3983   if (id_str [0] != '?')
3984   {
3985     return FALSE;
3986   }
3987   if (StringICmp (id_str + 1, "unk100") == 0)
3988   {
3989     return TRUE;
3990   }
3991   else 
3992   {
3993     /* make sure there are only numbers after the question mark */
3994     j = 1;
3995     while (isdigit (id_str [j]))
3996     {
3997       j++;
3998     }
3999     if (id_str [j] == 0)
4000     {
4001       return TRUE;
4002     }
4003     else
4004     {
4005       return FALSE;
4006     }
4007   }
4008 }
4009 
4010 static Boolean HasNoSeqID (SeqEntryPtr sep)
4011 {
4012   BioseqPtr bsp;
4013 
4014   if (sep == NULL || ! IS_Bioseq (sep) || (bsp = sep->data.ptrvalue) == NULL)
4015   {
4016     return FALSE;
4017   }
4018   if (bsp->id == NULL)
4019   {
4020     return TRUE;
4021   }
4022   else
4023   {
4024     return FALSE;
4025   }
4026 }
4027 
4028 static void PutDeflineIDBackInTitle (BioseqPtr bsp)
4029 {
4030   SeqDescrPtr sdp;
4031   CharPtr     id_txt;
4032   CharPtr     title_txt;
4033   
4034   if (bsp == NULL || bsp->id == NULL)
4035   {
4036     return;
4037   }
4038 
4039   sdp = bsp->descr;
4040   while (sdp != NULL && sdp->choice != Seq_descr_title)
4041   {
4042     sdp = sdp->next;
4043   }
4044   if (sdp == NULL)
4045   {
4046     sdp = CreateNewDescriptorOnBioseq (bsp, Seq_descr_title);
4047   }
4048   if (sdp == NULL)
4049   {
4050     return;
4051   }
4052   
4053   id_txt = SeqIdWholeLabel (bsp->id, PRINTID_REPORT);
4054   
4055   if (StringHasNoText (sdp->data.ptrvalue))
4056   {
4057     sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
4058     sdp->data.ptrvalue = id_txt;
4059   }
4060   else
4061   {
4062     title_txt = (CharPtr) MemNew (sizeof (Char) * (StringLen (id_txt) + StringLen (sdp->data.ptrvalue) + 2));
4063     StringCpy (title_txt, id_txt);
4064     StringCat (title_txt, " ");
4065     StringCat (title_txt, sdp->data.ptrvalue);
4066     sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
4067     sdp->data.ptrvalue = title_txt;
4068     id_txt = MemFree (id_txt);
4069   }
4070   
4071   bsp->id = SeqIdFree (bsp->id);  
4072 }
4073 
4074 static Char GetNextCharacterFromFile (FILE *fp, BoolPtr pIsASN)
4075 {
4076   FileCache    fc;
4077   CharPtr      str;
4078   Char         special_symbol;
4079   Char         line [128];
4080   Int4         pos;
4081   
4082   /* look ahead to see what character caused inability to interpret line */
4083   FileCacheSetup (&fc, fp);
4084   /* pos = FileCacheTell (&fc); */
4085   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4086   if (str != NULL && StringDoesHaveText (str)) {
4087     TrimSpacesAroundString (str);
4088   }
4089   special_symbol = line [0];
4090   if (pIsASN != NULL)
4091   {
4092     if (StringStr (line, "::=") != NULL)
4093     {
4094       *pIsASN = TRUE;
4095     }
4096     else
4097     {
4098       *pIsASN = FALSE;
4099     }
4100   }
4101   /* seek to start of next line after one that could not be interpreted */
4102   pos = FileCacheTell (&fc);
4103   FileCacheSetup (&fc, fp);
4104   FileCacheSeek (&fc, pos);
4105   fseek (fp, pos, SEEK_SET);
4106   return special_symbol;
4107 }
4108 
4109 static Int4 FindLineForStartOfBadRead (FILE *fp, Int4 pos)
4110 {
4111   FileCache    fc;
4112   Int4         line_num = 0;
4113   Char         line [4096];
4114   CharPtr      str;
4115   
4116   if (fp == NULL || pos == 0) {
4117     return 0;
4118   }
4119 
4120   FileCacheSetup (&fc, fp);
4121   FileCacheSeek (&fc, 0);
4122   fseek (fp, 0, SEEK_SET);
4123   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4124   while (str != NULL && FileCacheTell (&fc) < pos) {
4125       line_num++;
4126       str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4127   }
4128   return line_num;
4129 }
4130 
4131 
4132 static Int4 FindLineForBadReadChar (FILE *fp, Char badchar)
4133 {
4134   FileCache    fc;
4135   Int4         line_num = 0;
4136   Char         line [4096];
4137   CharPtr      str;
4138   
4139   if (fp == NULL) {
4140     return 0;
4141   }
4142 
4143   FileCacheSetup (&fc, fp);
4144   FileCacheSeek (&fc, 0);
4145   fseek (fp, 0, SEEK_SET);
4146   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4147   while (str != NULL && StringChr (str, badchar) == NULL) {
4148       line_num++;
4149       str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4150   }
4151   return line_num;
4152 }
4153 
4154 
4155 static void CleanTitles (SeqEntryPtr sep, ValNodePtr PNTR special_list)
4156 {
4157   BioseqPtr    bsp;
4158   BioseqSetPtr bssp;
4159   SeqDescrPtr  sdp = NULL;
4160 
4161   while (sep != NULL)
4162   {
4163     sdp = NULL;
4164     if (sep->choice == 1)
4165     {
4166       bsp = sep->data.ptrvalue;
4167       if (bsp != NULL) 
4168       {
4169         sdp = bsp->descr;
4170       }
4171     }
4172     else if (sep->choice == 2)
4173     {
4174       bssp = sep->data.ptrvalue;
4175       if (bssp != NULL)
4176       {
4177         CleanTitles (bssp->seq_set, special_list);
4178         sdp = bssp->descr;
4179       }
4180     }
4181     while (sdp != NULL)
4182     {
4183       if (sdp->choice == Seq_descr_title)
4184       {
4185         SpecialCharFindWithContext ((CharPtr PNTR) &(sdp->data.ptrvalue), special_list, NULL, NULL);
4186       }
4187       sdp = sdp->next;
4188     }
4189     sep = sep->next;
4190   }
4191 }
4192 
4193 
4194 static SeqEntryPtr ImportOnlyProteinSequences 
4195 (FILE            *fp,
4196  SeqEntryPtr     sep_list,
4197  Boolean         parse_id,
4198  CharPtr         supplied_id_txt,
4199  ValNodePtr PNTR err_msg_list,
4200  BoolPtr         chars_stripped)
4201 {
4202   Pointer       dataptr;
4203   Uint2         datatype;
4204   SeqEntryPtr   new_list = NULL, nextsep, lastsep = NULL, oldscope;
4205   Boolean       error_reading = FALSE;
4206   Int4          pos, bad_start;
4207   BioseqPtr     bsp;
4208   ValNodePtr    special_list = NULL;
4209 
4210   oldscope = SeqEntrySetScope (NULL);
4211 
4212   pos = ftell (fp);  
4213   dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, TRUE,
4214                                     FALSE, FALSE, chars_stripped);
4215   while (dataptr != NULL) 
4216   {
4217     bsp = NULL;
4218     if (datatype == OBJ_SEQENTRY)
4219     {
4220       nextsep = (SeqEntryPtr) dataptr;
4221       if (IS_Bioseq(nextsep)) {
4222         bsp = nextsep->data.ptrvalue;
4223       } else {
4224         nextsep = NULL;
4225       }      
4226     }
4227     else if (datatype == OBJ_BIOSEQ)
4228     {
4229       bsp = dataptr;
4230       nextsep = SeqMgrGetSeqEntryForData(bsp);
4231     }
4232     if (bsp != NULL)
4233     {
4234       if (parse_id) 
4235       {
4236         ReplaceFakeIDWithIDFromTitle ((BioseqPtr) nextsep->data.ptrvalue);
4237       } 
4238       else 
4239       {
4240         if (!StringHasNoText (supplied_id_txt))
4241         {
4242           bsp->id = MakeSeqID (supplied_id_txt);
4243         }
4244       } 
4245       SeqEntryPack (nextsep);
4246       if (lastsep == NULL) {
4247         new_list = nextsep;
4248       } else {
4249         lastsep->next = nextsep;
4250       }
4251       lastsep = nextsep;
4252       pos = ftell (fp);  
4253       dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, TRUE,
4254                                           FALSE, FALSE, chars_stripped);
4255     }
4256     else
4257     {
4258       if (dataptr != NULL) {
4259         error_reading = TRUE;
4260       }
4261       dataptr = NULL;
4262     }
4263   }
4264   if (*chars_stripped || error_reading)
4265   {
4266     bad_start = FindLineForStartOfBadRead (fp, pos);
4267     Message (MSG_ERROR, "Unable to read file, starting at line %d!", bad_start);
4268     new_list = SeqEntryFree (new_list);
4269   }
4270   else
4271   {
4272     CleanTitles (new_list, &special_list);
4273     if (!FixSpecialCharactersForStringsInList (special_list, "Definition lines contain special characters.\nThe sequences cannot be imported unless the characters are replaced.", FALSE))
4274     {
4275       new_list = SeqEntryFree (new_list);
4276     }
4277     special_list = FreeContextList (special_list);
4278   }
4279   
4280   lastsep = sep_list;
4281   while (lastsep != NULL && lastsep->next != NULL) 
4282   {
4283     lastsep = lastsep->next;
4284   }
4285   if (lastsep == NULL)
4286   {
4287     sep_list = new_list;
4288   }
4289   else
4290   {
4291     lastsep->next = new_list;
4292   }
4293 
4294   SeqEntrySetScope (oldscope);
4295 
4296   return sep_list;
4297   
4298 }
4299  
4300 extern SeqEntryPtr 
4301 ImportSequencesFromFileEx
4302 (FILE           *fp, 
4303  SeqEntryPtr     sep_list,
4304  Boolean         is_na, 
4305  Boolean         parse_id,
4306  CharPtr         supplied_id_txt,
4307  ValNodePtr PNTR err_msg_list,
4308  BoolPtr         chars_stripped,
4309  Boolean         allow_char_stripping)
4310 {
4311   Int4          count;
4312   SeqEntryPtr   last;
4313   Char          lastchar;
4314   SeqEntryPtr   nextsep;
4315   CharPtr       errormsg = NULL;
4316   BioseqPtr     bsp = NULL;
4317   SeqEntryPtr   new_sep_list = NULL;
4318   ErrSev        oldsev;
4319   Boolean       read_from_delta;
4320   SeqEntryPtr   oldscope;
4321   Int4          pos, last_no_id_start = -1;
4322   Boolean       this_chars_stripped = FALSE;
4323   Boolean       isASN = FALSE, isOnlyFASTA = FALSE;
4324   Int4          bad_start = 0, bad_line = 0;
4325   ValNodePtr    special_list = NULL;
4326   
4327   if (chars_stripped != NULL)
4328   {
4329     *chars_stripped = FALSE;
4330   }
4331   
4332   if (!is_na) {
4333     return ImportOnlyProteinSequences (fp, sep_list, parse_id, supplied_id_txt, err_msg_list, chars_stripped);
4334   }
4335   
4336   count = 0;
4337   
4338   new_sep_list = NULL;
4339   last = NULL;
4340   
4341   oldscope = SeqEntrySetScope (NULL);
4342   
4343   pos = ftell (fp);
4344   
4345   bsp = NULL;
4346   oldsev = ErrSetMessageLevel (SEV_MAX);
4347   bsp = ReadDeltaFastaEx (fp, NULL, &this_chars_stripped);
4348   if (chars_stripped != NULL)
4349   {
4350     *chars_stripped |= this_chars_stripped;
4351   }
4352   if (bsp != NULL && !parse_id)
4353   {
4354     PutDeflineIDBackInTitle (bsp);
4355     if (!StringHasNoText (supplied_id_txt))
4356     {
4357       bsp->id = MakeSeqID (supplied_id_txt);
4358     }
4359   }
4360   ErrSetMessageLevel (oldsev);
4361   
4362   /* note - we pass in FALSE for parse_id in SequinFastaToSeqEntryEx
4363    * because we do not want to use Sequin's auto-generated sequence IDs.
4364    * We then parse the sequence ID from the title ourselves using
4365    * ReplaceFakeIDWithIDFromTitle if parse_id is TRUE, or leave the ID
4366    * as blank to force the user to select a real ID later.
4367    */
4368   
4369   if (bsp == NULL)
4370   {
4371     bsp = ReadFastaOnly (fp,
4372                          is_na, !is_na,
4373                          &this_chars_stripped,
4374                          &lastchar);
4375     if (bsp == NULL)
4376     {
4377       if (lastchar == 0)
4378       {
4379         lastchar = GetNextCharacterFromFile (fp, &isASN);
4380       }
4381     } else {
4382       isOnlyFASTA = TRUE;
4383     }
4384 
4385     nextsep = SeqMgrGetSeqEntryForData (bsp);
4386     if (chars_stripped != NULL)
4387     {
4388       *chars_stripped |= this_chars_stripped;
4389     }
4390     read_from_delta = FALSE;
4391   }
4392   else
4393   {
4394     nextsep = SeqMgrGetSeqEntryForData (bsp);
4395     lastchar = '\n';
4396     read_from_delta = TRUE;
4397   }
4398   while ((nextsep != NULL ||
4399          (lastchar == '\n' || lastchar == '['))
4400          && !isASN
4401          && (allow_char_stripping || !this_chars_stripped))
4402   {
4403     if (nextsep != NULL) 
4404     {
4405       if (!read_from_delta
4406           && IS_Bioseq (nextsep) 
4407           && nextsep->data.ptrvalue != NULL)
4408       {
4409         bsp = (BioseqPtr) nextsep->data.ptrvalue;
4410         /* replace fake ID with ID from title for sequences that aren't deltas */
4411         if (parse_id)
4412         {
4413           ReplaceFakeIDWithIDFromTitle ((BioseqPtr) nextsep->data.ptrvalue);
4414         }
4415         else
4416         {
4417           bsp->id = SeqIdFree (bsp->id);
4418           if (!StringHasNoText (supplied_id_txt))
4419           {
4420                 bsp->id = MakeSeqID (supplied_id_txt);
4421           }
4422         }
4423         SeqEntryPack (nextsep);
4424       }
4425       
4426       if (last_no_id_start > -1)
4427       {
4428         if (HasGapID (nextsep))
4429         {
4430           nextsep = SeqEntryFree (nextsep);
4431           bsp = last->data.ptrvalue;
4432           SeqMgrDeleteFromBioseqIndex (bsp);
4433           bsp = BioseqFree (bsp);
4434           fseek (fp, last_no_id_start, SEEK_SET);
4435           bsp = ReadDeltaFastaWithEmptyDefline (fp, NULL, chars_stripped);
4436           last->data.ptrvalue = bsp;
4437           bsp->id = SeqIdFree (bsp->id);
4438           last_no_id_start = -1;
4439         }
4440         else if (HasNoSeqID (nextsep))
4441         {
4442           last_no_id_start = pos;
4443         }
4444         else
4445         {
4446           last_no_id_start = -1;
4447         }
4448       }
4449       else if (HasNoSeqID (nextsep))
4450       {
4451         last_no_id_start = pos;
4452       }
4453       
4454       ValNodeAddPointer (err_msg_list, 0, errormsg);
4455       errormsg = NULL;
4456     }
4457     else if (lastchar == '[')
4458     {
4459       nextsep = ReadOneSegSet (fp, parse_id, err_msg_list, &this_chars_stripped);
4460       if (chars_stripped != NULL)
4461       {
4462         *chars_stripped |= this_chars_stripped;
4463       }      
4464     }
4465     if (nextsep != NULL)
4466     {
4467       if (last == NULL) 
4468       {
4469         new_sep_list = nextsep;
4470         last = nextsep;
4471       }
4472       else 
4473       {
4474         last->next = nextsep;
4475         last = nextsep;
4476       }
4477     } 
4478     
4479     pos = ftell (fp);
4480     bsp = NULL;
4481     if (isOnlyFASTA) 
4482     {
4483       lastchar = (Char) EOF;
4484       nextsep = NULL;
4485     }     
4486     else if (is_na)
4487     {
4488       oldsev = ErrSetMessageLevel (SEV_MAX);
4489       bsp = ReadDeltaFastaEx (fp, NULL, &this_chars_stripped);
4490       if (chars_stripped != NULL)
4491       {
4492         *chars_stripped |= this_chars_stripped;
4493       }      
4494       ErrSetMessageLevel (oldsev);
4495       if (!parse_id)
4496       {
4497         PutDeflineIDBackInTitle (bsp);
4498       }
4499     }
4500     
4501     if (isOnlyFASTA) {
4502       /* done with loop */
4503     }
4504     else if (bsp == NULL)
4505     {
4506       bsp = ReadFastaOnly (fp,
4507                            is_na, !is_na,
4508                            &this_chars_stripped,
4509                            &lastchar);
4510       if (bsp == NULL)
4511       {
4512         if (lastchar == 0)
4513         {
4514           lastchar = GetNextCharacterFromFile (fp, &isASN);
4515         }
4516       } 
4517       else 
4518       {
4519         isOnlyFASTA = TRUE;
4520       }
4521       nextsep = SeqMgrGetSeqEntryForData (bsp);
4522       if (chars_stripped != NULL)
4523       {
4524         *chars_stripped |= this_chars_stripped;
4525       }      
4526       read_from_delta = FALSE;
4527     }
4528     else
4529     {
4530       nextsep = SeqMgrGetSeqEntryForData (bsp);
4531       lastchar = '\n';
4532       read_from_delta = TRUE;
4533     }
4534   }
4535   
4536   if ((!allow_char_stripping && this_chars_stripped) || (lastchar != (Char) EOF && lastchar != NULLB && lastchar != (Char) 255))
4537   {
4538     if (!this_chars_stripped && !isASN) {
4539       bad_start = FindLineForStartOfBadRead (fp, pos);
4540       bad_line = FindLineForBadReadChar (fp, lastchar);
4541       Message (MSG_ERROR, "Unable to read file, starting at line %d (found bad character '%c' at line %d)!", bad_start, lastchar, bad_line);
4542     }
4543     new_sep_list = SeqEntryFree (new_sep_list);
4544   }
4545   else
4546   {
4547     CleanTitles (new_sep_list, &special_list);
4548     if (!FixSpecialCharactersForStringsInList (special_list, "Definition lines contain special characters.\nThe sequences cannot be imported unless the characters are replaced.", FALSE))
4549     {
4550       new_sep_list = SeqEntryFree (new_sep_list);
4551     }
4552     special_list = FreeContextList (special_list);
4553   }
4554   
4555   last = sep_list;
4556   while (last != NULL && last->next != NULL) 
4557   {
4558     last = last->next;
4559   }
4560   if (last == NULL)
4561   {
4562     sep_list = new_sep_list;
4563   }
4564   else
4565   {
4566     last->next = new_sep_list;
4567   }
4568 
4569   SeqEntrySetScope (oldscope);
4570 
4571   return sep_list;
4572 }
4573 
4574 
4575 extern SeqEntryPtr 
4576 ImportSequencesFromFile
4577 (FILE           *fp, 
4578  SeqEntryPtr     sep_list,
4579  Boolean         is_na, 
4580  Boolean         parse_id,
4581  CharPtr         supplied_id_txt,
4582  ValNodePtr PNTR err_msg_list,
4583  BoolPtr         chars_stripped)
4584 {
4585   return ImportSequencesFromFileEx (fp, sep_list, is_na, parse_id, supplied_id_txt, err_msg_list, chars_stripped, FALSE);
4586 }
4587 
4588 
/* Forward declaration: ImportFastaDialog calls this function, whose
 * definition is not part of this section of the file. */
4589 static Boolean CollectIDsAndTitles (SeqEntryPtr new_list, SeqEntryPtr current_list, Boolean is_nuc);
4590 
4591 static SeqEntryPtr RemoveZeroLengthSequences (SeqEntryPtr list, Int4Ptr pnum_seqs, Int4Ptr pnum_zero)
4592 {
4593   SeqEntryPtr  prev_sep, next_sep, this_sep;
4594   Int4         num_seqs = 0, num_zero = 0;
4595   BioseqPtr    bsp;
4596   BioseqSetPtr bssp;
4597   
4598   if (list == NULL)
4599   {
4600     return NULL;
4601   }
4602 
4603   prev_sep = NULL;
4604   this_sep = list;
4605   while (this_sep != NULL)
4606   {
4607     num_seqs++;
4608     next_sep = this_sep->next;
4609     if (this_sep->data.ptrvalue == NULL)
4610     {
4611       num_zero++;
4612       if (prev_sep == NULL)
4613       {
4614         list = next_sep;
4615       }
4616       else
4617       {
4618         prev_sep->next = next_sep;
4619       }
4620       this_sep->next = NULL;
4621       SeqEntryFree (this_sep);
4622     }
4623     else if (IS_Bioseq (this_sep))
4624     {
4625       bsp = (BioseqPtr) this_sep->data.ptrvalue;
4626       if (bsp->length == 0)
4627       {
4628         num_zero++;
4629         
4630         if (prev_sep == NULL)
4631         {
4632           list = next_sep;
4633         }
4634         else
4635         {
4636           prev_sep->next = next_sep;
4637         }
4638         this_sep->next = NULL;
4639         SeqEntryFree (this_sep);
4640       }
4641       else
4642       {
4643         prev_sep = this_sep;
4644       }
4645     }
4646     else if (IS_Bioseq_set (this_sep))
4647     {
4648       bssp = (BioseqSetPtr) this_sep->data.ptrvalue;
4649       bssp->seq_set = RemoveZeroLengthSequences (bssp->seq_set, pnum_seqs, pnum_zero);
4650       if (bssp->seq_set == NULL)
4651       {
4652         num_zero++;
4653         if (prev_sep == NULL)
4654         {
4655           list = next_sep;
4656         }
4657         else
4658         {
4659           prev_sep->next = next_sep;
4660         }
4661         this_sep->next = NULL;
4662         SeqEntryFree (this_sep);
4663       }
4664       else
4665       {
4666         prev_sep = this_sep;
4667       }
4668     }
4669     else
4670     {
4671       prev_sep = this_sep;
4672     }
4673     this_sep = next_sep;
4674   }
4675   
4676   if (pnum_seqs != NULL)
4677   {
4678     *pnum_seqs += num_seqs;
4679   }
4680   if (pnum_zero != NULL)
4681   {
4682     *pnum_zero += num_zero;
4683   }
4684   return list;
4685 }
4686 
4687 static Boolean RejectZeroLengthSequences (SeqEntryPtr PNTR new_list)
4688 {
4689   SeqEntryPtr next_sep;
4690   Int4        num_zero = 0, num_seq = 0;
4691   Boolean     rval = TRUE;
4692   Boolean     delete_all = FALSE;
4693   
4694   if (new_list == NULL)
4695   {
4696     return FALSE;
4697   }
4698   
4699   *new_list = RemoveZeroLengthSequences (*new_list, &num_seq, &num_zero);
4700 
4701   if (num_zero > 0)
4702   {
4703     ResetSegSetIDLists (*new_list);
4704     if (num_zero == num_seq)
4705     {
4706       Message (MSG_ERROR, "The sequences in your file are empty - you cannot import them.");
4707       delete_all = TRUE;
4708       rval = FALSE;
4709     }
4710     else if (ANS_CANCEL == Message (MSG_OKC, "%d sequences in your file are empty and cannot be imported.  "
4711                                     "Would you like to import the remaining sequences?", num_zero))
4712     {
4713       delete_all = TRUE;
4714       rval = FALSE;
4715     }
4716     if (delete_all)
4717     {
4718       
4719       while ((*new_list) != NULL)
4720       {
4721         next_sep = (*new_list)->next;
4722         (*new_list)->next = NULL;
4723         SeqEntryFree (*new_list);
4724         *new_list = next_sep;
4725       }
4726     }
4727   }
4728   return rval;
4729 }
4730 
4731 static Boolean RejectExtraSequences (SeqEntryPtr new_list, FastaPagePtr fpp)
4732 {
4733   SeqEntryPtr sep, next_sep;
4734   
4735   if (new_list == NULL || fpp == NULL)
4736   {
4737     return FALSE;
4738   }
4739   else if (!fpp->single || new_list->next == NULL)
4740   {
4741     return TRUE;
4742   }
4743 
4744   if (fpp->is_na 
4745            && fpp->seqPackagePtr != NULL 
4746            && *(fpp->seqPackagePtr) != SEQ_PKG_GENOMICCDNA)
4747   {
4748     if (Message (MSG_YN, "You are importing multiple sequences - did you intend to create a batch submission?") == ANS_YES)
4749     {
4750       *(fpp->seqPackagePtr) = SEQ_PKG_GENBANK;
4751       fpp->single = FALSE;
4752       SafeHide (fpp->singleIdGrp);
4753       return TRUE;
4754     }
4755   }
4756   if (Message (MSG_YN, "You cannot import multiple sequences - import the first one and ignore the rest?") == ANS_YES)
4757   {
4758     sep = new_list->next;
4759     new_list->next = NULL;
4760     while (sep != NULL)
4761     {
4762       next_sep = sep->next;
4763       sep->next = NULL;
4764       sep = SeqEntryFree (sep);
4765       sep = next_sep;
4766     }
4767     return TRUE;
4768   }
4769   else
4770   {
4771     return FALSE;
4772   }
4773 }
4774 
4775 static void ShowImportHelp (ButtoN b)
4776 {
4777   CharPtr help_msg;
4778   
4779   help_msg = (CharPtr) GetObjectExtra (b);
4780   if (help_msg == NULL)
4781   {
4782     return;
4783   }
4784   
4785   Message (MSG_OK, help_msg);
4786 }
4787 
4788 static Boolean OkToImport (CharPtr msg, CharPtr help_msg)
4789 {
4790   WindoW w;
4791   GrouP  h, c;
4792   PrompT p;
4793   ButtoN b;
4794   ModalAcceptCancelData acd;
4795   
4796   if (msg == NULL)
4797   {
4798     return TRUE;
4799   }
4800   acd.accepted = FALSE;
4801   acd.cancelled = FALSE;
4802   
4803   w = ModalWindow(-20, -13, -10, -10, NULL);
4804   h = HiddenGroup (w, -1, 0, NULL);
4805   
4806   p = StaticPrompt (h, msg, 0, 0, programFont, 'l');
4807   c = HiddenGroup (h, 3, 0, NULL);
4808   b = PushButton (c, "Yes", ModalAcceptButton);
4809   SetObjectExtra (b, &acd, NULL);
4810   b = PushButton (c, "No", ModalCancelButton);
4811   SetObjectExtra (b, &acd, NULL);
4812   if (help_msg != NULL)
4813   {
4814     b = PushButton (c, "Help", ShowImportHelp);
4815     SetObjectExtra (b, help_msg, NULL);
4816   }
4817   AlignObjects (ALIGN_CENTER, (HANDLE) p, (HANDLE) c, NULL);
4818   
4819   Show(w); 
4820   Select (w);
4821   while (!acd.accepted && ! acd.cancelled)
4822   {
4823     ProcessExternalEvent ();
4824     Update ();
4825   }
4826   ProcessAnEvent ();
4827   Remove (w);
4828   if (acd.accepted)
4829   {
4830     return TRUE;
4831   }
4832   else
4833   {
4834     return FALSE;
4835   }
4836 }
4837 
/* Help texts shown by the "Help" button of the OkToImport confirmation
 * window; ImportedSequenceTypeOk passes the first one for segmented
 * sequences and the second one for gapped (delta) sequences. */
4838 static CharPtr segset_import_help_str = "Segmented sequence: a collection of non-overlapping, non-contiguous sequences that cover a specified genetic region. A standard example is a set of genomic DNA sequences that encode exons from a gene along with fragments of their flanking introns.";
4839 static CharPtr gapped_import_help_str = "Gapped sequence: a sequence with one or more gaps of known or unknown length.";
4840 
4841 
4842 static Boolean ImportedSequenceTypeOk (SeqEntryPtr list, Int2 seqPackage)
4843 {
4844   BioseqPtr bsp;
4845   Boolean   rval = TRUE;
4846   
4847   if (list == NULL || seqPackage != SEQ_PKG_SINGLE)
4848   {
4849     return TRUE;
4850   }
4851   if (list->choice == 1)
4852   {
4853     bsp = (BioseqPtr) list->data.ptrvalue;
4854     if (bsp != NULL && bsp->repr == Seq_repr_delta)
4855     {
4856       SendHelpScrollMessage (helpForm, "Sequence Format Form", NULL);
4857       rval = OkToImport ("You have imported a gapped sequence.  Did you mean to do that?",
4858                          gapped_import_help_str);
4859     }
4860   }
4861   else if (list->choice == 2)
4862   {
4863     SendHelpScrollMessage (helpForm, "Sequence Format Form", NULL);
4864     rval = OkToImport ("You have imported a segmented sequence.  Did you mean to do that?",
4865                        segset_import_help_str);
4866   }
4867   return rval;
4868 }
4869 
4870 static Boolean ImportFastaDialog (DialoG d, CharPtr filename)
4871 
4872 {
4873   CharPtr       extension;
4874   FILE          *f;
4875   FastaPagePtr  fpp;
4876   ValNodePtr    head;
4877   Char          path [PATH_MAX];
4878   RecT          r;
4879   SeqEntryPtr   sep, new_sep_list, new_sep, test_sep;
4880   Boolean       rval = FALSE;
4881   BioseqPtr     bsp;
4882   CharPtr       supplied_id_txt = NULL;
4883   Boolean       chars_stripped = FALSE;
4884 
4885   path [0] = '\0';
4886   StringNCpy_0 (path, filename, sizeof (path));
4887   fpp = (FastaPagePtr) GetObjectExtra (d);
4888   if (fpp != NULL) {
4889     if (fpp->list != NULL && fpp->single)
4890     {
4891       if (!fpp->is_na
4892           || fpp->seqPackagePtr == NULL
4893           || *fpp->seqPackagePtr == SEQ_PKG_GENOMICCDNA)
4894       {
4895         Message (MSG_ERROR, "Can't import additional sequences!");
4896         return FALSE;
4897       }
4898       else
4899       {
4900         if (Message (MSG_YN, "You are importing multiple sequences - did you intend to create a batch submission?") == ANS_NO)
4901         {
4902           Message (MSG_ERROR, "Can't import additional sequences!");
4903           return FALSE;
4904         }
4905         else
4906         {
4907           *(fpp->seqPackagePtr) = SEQ_PKG_GENBANK;
4908           fpp->single = FALSE;
4909           SafeHide (fpp->singleIdGrp);
4910         }
4911       }
4912     }
4913     extension = NULL;
4914     if (fpp->is_mrna) {
4915       extension = GetAppProperty ("FastaNucExtension");
4916     } else if (fpp->is_na) {
4917       extension = GetAppProperty ("FastaNucExtension");
4918     } else {
4919       extension = GetAppProperty ("FastaProtExtension");
4920     }
4921     if (path [0] != '\0' || GetInputFileName (path, sizeof (path), extension, "TEXT")) {
4922       WatchCursor ();
4923       StringCpy (fpp->path, path);
4924       ObjectRect (fpp->doc, &r);
4925       InsetRect (&r, 4, 4);
4926       faColFmt.pixWidth = r.right - r.left;
4927       /*
4928       ResetFastaPage (fpp);
4929       */
4930       Reset (fpp->doc);
4931       Update ();
4932       sep = fpp->list;
4933       head = fpp->errmsgs;
4934       f = FileOpen (fpp->path, "r");
4935       if (f == NULL)
4936       {
4937         Message (MSG_ERROR, "Unable to open %s", fpp->path);
4938         fpp->path[0] = 0;
4939       }
4940       else
4941       {
4942         if (fpp->singleSeqID != NULL)
4943         {
4944           supplied_id_txt = SaveStringFromText (fpp->singleSeqID);
4945         }
4946         new_sep_list = ImportSequencesFromFile (f, NULL, fpp->is_na, 
4947                                                 fpp->parseSeqId, 
4948                                                 supplied_id_txt,
4949                                                 &head, &chars_stripped);
4950         if (chars_stripped && new_sep_list != NULL)
4951         {
4952           if (ANS_CANCEL == Message (MSG_OKC, "Illegal characters will be stripped from your sequence data.  Do you want to continue?"))
4953           {
4954             new_sep_list = SeqEntryFree (new_sep_list);
4955             FileClose (f);
4956             fpp->path [0] = 0;
4957             ArrowCursor ();
4958             Update ();
4959             return FALSE;
4960           }
4961         }
4962         supplied_id_txt = MemFree (supplied_id_txt);                                              
4963         if (fpp->seqPackagePtr != NULL 
4964             && *(fpp->seqPackagePtr) == SEQ_PKG_SEGMENTED
4965             && new_sep_list != NULL
4966             && IS_Bioseq (new_sep_list))
4967         {
4968           new_sep_list = SegsetFromSeqEntryList (new_sep_list);
4969         }
4970         FileClose (f);
4971         
4972         if (new_sep_list != NULL
4973             && new_sep_list->next == NULL
4974             && fpp->single
4975             && fpp->list == NULL
4976             && fpp->is_na
4977             && new_sep_list->choice == 1
4978             && new_sep_list->data.ptrvalue != NULL)
4979         {
4980           bsp = (BioseqPtr) new_sep_list->data.ptrvalue;
4981           
4982           /* assign a fake ID if there is only one sequence being imported, 
4983            * the package type is single, and there are no other sequences
4984            * from previous imports.
4985            */
4986           
4987           if (bsp->id == NULL)
4988           {
4989             bsp->id = MakeSeqID ("nuc_1");
4990           }
4991         }
4992       
4993         if (new_sep_list == NULL)
4994         {
4995           Message (MSG_ERROR, "Unable to read sequences from %s", fpp->path);
4996           fpp->path [0] = 0;
4997         }
4998         else if (! RejectZeroLengthSequences (&new_sep_list))
4999         {
5000           fpp->path [0] = 0;
5001         }
5002         else if (! RejectExtraSequences (new_sep_list, fpp))
5003         {
5004           /* if unsuccessful, delete new list */ 
5005           new_sep = new_sep_list;   
5006           while (new_sep != NULL)
5007           {
5008             test_sep = new_sep->next;
5009             SeqEntryFree (new_sep);
5010             new_sep = test_sep;
5011           }
5012           fpp->path [0] = 0;
5013         }
5014         else if (fpp->seqPackagePtr != NULL
5015                  && ! ImportedSequenceTypeOk (new_sep_list, *(fpp->seqPackagePtr)))
5016         {
5017           /* if unsuccessful, delete new list */ 
5018           new_sep = new_sep_list;   
5019           while (new_sep != NULL)
5020           {
5021             test_sep = new_sep->next;
5022             SeqEntryFree (new_sep);
5023             new_sep = test_sep;
5024           }
5025           fpp->path [0] = 0;
5026         }
5027         else if (CollectIDsAndTitles (new_sep_list, fpp->list, (fpp->is_na && ! fpp->is_mrna)))
5028         {
5029           if (fpp->is_na)
5030           {
5031             /* add default molecule type, topology, location, and genetic codes */
5032             AddDefaultModifierValues (new_sep_list);
5033           }
5034         
5035           /* if successful, link old and new lists */
5036           ValNodeLink (&(fpp->list), new_sep_list);
5037           rval = TRUE;
5038         }
5039         else
5040         {
5041           /* if unsuccessful, delete new list */ 
5042           new_sep = new_sep_list;   
5043           while (new_sep != NULL)
5044           {
5045             test_sep = new_sep->next;
5046             SeqEntryFree (new_sep);
5047             new_sep = test_sep;
5048           }
5049           fpp->path [0] = 0;
5050         }
5051       }
5052       
5053       if (fpp->list == NULL)
5054       {
5055         SafeHide (fpp->have_seq_instr_grp);
5056         Reset (fpp->doc);
5057         SafeShow (fpp->instructions);
5058         Update ();
5059         SetTitle (fpp->import_btn, "Import Nucleotide FASTA");
5060         Enable (fpp->import_btn);
5061         Disable (fpp->clear_btn);
5062       }
5063       else
5064       {        
5065         SafeHide (fpp->instructions);
5066         Update ();
5067         if (! fpp->is_na || fpp->single 
5068             || fpp->seqPackagePtr == NULL 
5069             || *fpp->seqPackagePtr == SEQ_PKG_GENOMICCDNA)
5070         {
5071           Disable (fpp->import_btn);
5072         }
5073         else
5074         {
5075           Enable (fpp->import_btn);
5076           SetTitle (fpp->import_btn, "Import Additional Nucleotide FASTA");
5077         }
5078         Enable (fpp->clear_btn);
5079         FormatFastaDoc (fpp);
5080         SafeShow (fpp->have_seq_instr_grp);
5081       }
5082       ArrowCursor ();
5083       Update ();
5084       return rval;
5085     }
5086   }
5087   return FALSE;
5088 }
5089 
/* Largest number of residues ExportSeqPort requests for one output line;
 * its line buffer holds this many characters plus one. */
5090 #define EXPORT_PAGE_WIDTH 80
5091 
5092 static void ExportSeqIdAndTitle (SeqIdPtr sip, CharPtr title, FILE *fp)
5093 {
5094   CharPtr id_str = NULL;
5095 
5096   if (fp == NULL)
5097   {
5098     return;
5099   }
5100   
5101   id_str [0] = 0;
5102   if (sip == NULL)
5103   {
5104     id_str = StringSave ("unknown_id");
5105   }
5106   else
5107   {
5108     id_str = SeqIdWholeLabel (sip, PRINTID_REPORT);
5109   }
5110   
5111   if (StringCSpn (id_str, " \t") == StringLen (id_str))
5112   {
5113     fprintf (fp, ">%s %s\n", id_str, title == NULL ? "" : title);
5114   }
5115   else
5116   {
5117     fprintf (fp, ">'%s' %s\n", id_str, title == NULL ? "" : title);
5118   }
5119   id_str = MemFree (id_str);
5120 }
5121 
5122 static void ExportSeqPort (Int4 from, Int4 to, SeqPortPtr spp, FILE *fp)
5123 {
5124   Char        buffer [EXPORT_PAGE_WIDTH + 1];
5125   Int4        seq_offset, txt_out;
5126 
5127   if (spp == NULL || fp == NULL || from < 0 || to <= from)
5128   {
5129     return;
5130   }
5131   
5132   seq_offset = from;
5133   while (seq_offset < to)
5134   {
5135     txt_out = ReadBufferFromSep (spp, buffer, seq_offset, 
5136                                  MIN (seq_offset + EXPORT_PAGE_WIDTH, to), 0);
5137     if (txt_out == 0) break;
5138     seq_offset += txt_out;
5139     fprintf(fp, "%s\n", buffer);
5140   }
5141   
5142 }
5143 
5144 static void ExportOneRawSequence (BioseqPtr bsp, CharPtr title_master, FILE *fp)
5145 {
5146   SeqDescrPtr sdp;
5147   Char        buffer [EXPORT_PAGE_WIDTH + 1];
5148   SeqPortPtr  spp;
5149   CharPtr     title = NULL;
5150   CharPtr     combined_title = NULL;
5151   Boolean     free_combined = FALSE;
5152   
5153   if (bsp == NULL || fp == NULL || bsp->repr != Seq_repr_raw)
5154   {
5155     return;
5156   }  
5157   
5158   sdp = bsp->descr;
5159   while (sdp != NULL && sdp->choice != Seq_descr_title)
5160   {
5161     sdp = sdp->next;
5162   }
5163   if (sdp != NULL)
5164   {
5165     title = sdp->data.ptrvalue;
5166   }
5167   
5168   if (StringHasNoText (title_master))
5169   {
5170     combined_title = title;
5171   }
5172   else if (StringHasNoText (title))
5173   {
5174     combined_title = title_master;
5175   }
5176   else
5177   {
5178     combined_title = (CharPtr) MemNew ((StringLen (title_master) + StringLen (title) + 2) * sizeof (Char));
5179     if (combined_title != NULL)
5180     {
5181       StringCpy (combined_title, title_master);
5182       StringCat (combined_title, " ");
5183       StringCat (combined_title, title);
5184       free_combined = TRUE;
5185     }
5186   }
5187   
5188   ExportSeqIdAndTitle (bsp->id, combined_title, fp);
5189   if (free_combined)
5190   {
5191     combined_title = MemFree (combined_title);
5192   }
5193 
5194   buffer [EXPORT_PAGE_WIDTH] = 0;
5195   
5196   spp = SeqPortNew (bsp, 0, bsp->length-1, Seq_strand_plus, Seq_code_iupacna);
5197   
5198   ExportSeqPort (0, bsp->length, spp, fp);
5199 
5200   SeqPortFree (spp);
5201   fprintf (fp, "\n");  
5202 }
5203 
5204 static void ExportOneSegmentedBioseq (BioseqPtr bsp, FILE *fp)
5205 {
5206   SeqLocPtr   slp;
5207   BioseqPtr   bsp_seg;
5208   SeqDescrPtr sdp;
5209   CharPtr     title = NULL;
5210   
5211   if (bsp == NULL || fp == NULL || bsp->repr != Seq_repr_seg)
5212   {
5213     return;
5214   }
5215   
5216   fprintf (fp, "[\n");
5217   
5218   sdp = bsp->descr;
5219   while (sdp != NULL && sdp->choice != Seq_descr_title)
5220   {
5221     sdp = sdp->next;
5222   }
5223   if (sdp != NULL)
5224   {
5225     title = sdp->data.ptrvalue;
5226   }
5227 
5228   slp = (SeqLocPtr) bsp->seq_ext;
5229   while (slp != NULL)
5230   {
5231     bsp_seg = BioseqFind (SeqLocId (slp));
5232     ExportOneRawSequence (bsp_seg, title, fp);
5233     title = NULL;
5234     slp = slp->next;
5235   }
5236   fprintf (fp, "]\n\n");
5237 }
5238 
5239 static Boolean ExportOneDeltaBioseq (BioseqPtr bsp, FILE *fp)
5240 {
5241   SeqDescrPtr sdp;
5242   CharPtr     title = NULL;
5243   DeltaSeqPtr dsp;
5244   SeqLitPtr   slip;
5245   SeqPortPtr  spp;
5246   Char        buffer [EXPORT_PAGE_WIDTH + 1];
5247   Int4        seq_offset;
5248   
5249   if (bsp == NULL || fp == NULL || bsp->repr != Seq_repr_delta
5250       || bsp->seq_ext_type != 4 || bsp->seq_ext == NULL)
5251   {
5252     return FALSE;
5253   }
5254 
5255   dsp = (DeltaSeqPtr) bsp->seq_ext;
5256   while (dsp != NULL)
5257   {
5258     if (dsp->data.ptrvalue == NULL || dsp->choice != 2)
5259     {
5260       Message (MSG_ERROR, "Can't export badly formed delta sequence!");
5261       return FALSE;
5262     }
5263     dsp = dsp->next;
5264   }
5265   
5266   sdp = bsp->descr;
5267   while (sdp != NULL && sdp->choice != Seq_descr_title)
5268   {
5269     sdp = sdp->next;
5270   }
5271   if (sdp != NULL)
5272   {
5273     title = sdp->data.ptrvalue;
5274   }
5275 
5276   ExportSeqIdAndTitle (bsp->id, title, fp);
5277   
5278   buffer [EXPORT_PAGE_WIDTH] = 0;
5279   
5280   spp = SeqPortNew (bsp, 0, bsp->length-1, Seq_strand_plus, Seq_code_iupacna);
5281   
5282   seq_offset = 0;
5283   dsp = (DeltaSeqPtr) bsp->seq_ext;
5284   while (dsp != NULL)
5285   {
5286                 slip = (SeqLitPtr) (dsp->data.ptrvalue);
5287     if (IsDeltaSeqGap(dsp)) 
5288     {
5289       if (IsDeltaSeqUnknownGap (dsp))
5290       {
5291         fprintf (fp, ">?unk100\n");
5292       }
5293       else
5294       {
5295         fprintf (fp, ">?%d\n", slip->length);
5296       }
5297     }
5298     else
5299     {
5300       ExportSeqPort (seq_offset, seq_offset + slip->length, spp, fp);
5301                 }
5302     seq_offset += slip->length;
5303     dsp = dsp->next;
5304   }
5305   fprintf (fp, "\n");
5306   return TRUE;
5307 }
5308 
5309 static void ExportFASTASeqEntryList (SeqEntryPtr sep, FILE *fp)
5310 {
5311   BioseqPtr    bsp;
5312   BioseqSetPtr bssp;
5313   
5314   if (sep == NULL || sep->data.ptrvalue == NULL || fp == NULL)
5315   {
5316     return;
5317   }
5318   
5319   if (IS_Bioseq (sep))
5320   {
5321     bsp = (BioseqPtr) sep->data.ptrvalue;
5322     if (ISA_na (bsp->mol))
5323     {
5324       if (bsp->repr == Seq_repr_raw)
5325       {
5326         if (SeqMgrGetParentOfPart (bsp, NULL) == NULL)
5327         {
5328           ExportOneRawSequence (bsp, NULL, fp);
5329         }
5330       }
5331       else if (bsp->repr == Seq_repr_seg)
5332       {
5333         ExportOneSegmentedBioseq (bsp, fp);
5334       }
5335       else if (bsp->repr == Seq_repr_delta)
5336       {
5337         ExportOneDeltaBioseq (bsp, fp);    
5338       }
5339     }
5340   }
5341   else if (IS_Bioseq_set (sep))
5342   {
5343     bssp = (BioseqSetPtr) sep->data.ptrvalue;
5344     /* we don't export the parts set because we export them
5345      * when we do the master segment 
5346      */
5347     if (bssp->_class != BioseqseqSet_class_parts)
5348     {
5349       ExportFASTASeqEntryList (bssp->seq_set, fp);
5350     }
5351   }
5352   ExportFASTASeqEntryList (sep->next, fp);
5353 }
5354 
5355 static Boolean ExportNucleotideFASTADialog (DialoG d, CharPtr filename)
5356 {
5357   CharPtr       extension;
5358   FILE          *f;
5359   FastaPagePtr  fpp;
5360   Char          path [PATH_MAX];
5361   Boolean       rval = FALSE;
5362 
5363   fpp = (FastaPagePtr) GetObjectExtra (d);
5364   if (fpp == NULL) {
5365     return FALSE;
5366   }
5367   
5368   path [0] = '\0';
5369   StringNCpy_0 (path, filename, sizeof (path));
5370 
5371   extension = NULL;
5372   if (fpp->is_mrna) {
5373     extension = GetAppProperty ("FastaNucExtension");
5374   } else if (fpp->is_na) {
5375     extension = GetAppProperty ("FastaNucExtension");
5376   } else {
5377     extension = GetAppProperty ("FastaProtExtension");
5378   }
5379   if (path [0] != '\0' || GetOutputFileName (path, sizeof (path), extension)) {
5380     f = FileOpen (path, "w");
5381     if (f == NULL)
5382     {
5383       Message (MSG_ERROR, "Unable to open %s", path);
5384     }
5385     else
5386     {
5387       WatchCursor ();
5388       ExportFASTASeqEntryList (fpp->list, f);    
5389       FileClose (f);
5390       
5391       ArrowCursor ();
5392       Update ();
5393       rval = TRUE;
5394     }
5395   }
5396   return rval;
5397 }
5398 
5399 static void CleanupFastaDialog (GraphiC g, VoidPtr data)
5400 
5401 {
5402   FastaPagePtr  fpp;
5403 
5404   fpp = (FastaPagePtr) data;
5405   if (fpp != NULL) {
5406     ResetFastaPage (fpp);
5407   }
5408   MemFree (data);
5409 }
5410 
5411 static CharPtr  fastaNucMsg = "\
5412 \nClick on 'Import Nucleotide FASTA' to read a formatted FASTA file \
5413 or 'Add/Modify Sequences' to create the file here.  The FASTA definition \
5414 line must be in the following form:\n\n\
5415 >SeqID [organism=scientific name]\n\n\
5416 where the [ and ] brackets are actually in the text.\n\
5417 Properly formatted modifiers and a title can also be included in the FASTA definition line.";
5418 
5419 
5420 static CharPtr  fastaGenMsg = "\
5421 \nPlease enter information about the genomic \
5422 sequence in the spaces above.  Then click on either \
5423 'Add/Modify Sequences' to create your sequences with the editor or \
5424 'Import Genomic FASTA' to read a previously generated FASTA file that \
5425 contains the sequence (which can be in segments).  The \
5426 FASTA definition lines may be of the following form:\n\n\
5427 >ID [organism=scientific name] [strain=name] [clone=name] title\n\n\
5428 where the [ and ] brackets are actually in the text.";
5429 
5430 static CharPtr  fastaMrnaMsg  = "\
5431 \nPlease enter information about the transcript \
5432 sequences in the spaces above.  Then click on \
5433 'Import Transcript FASTA' to read a FASTA file that \
5434 contains the sequence (which can be in segments).  The \
5435 FASTA definition lines may be of the following form:\n\n\
5436 >ID [gene=symbol] [mrna=name] title\n\n\
5437 where the [ and ] brackets are actually in the text.";
5438 
5439 static CharPtr  fastaProtMsg = "\
5440 \nPlease enter information about the protein \
5441 sequences in the spaces above.  Then click on \
5442 'Import Protein FASTA' to read a FASTA file that \
5443 contains the sequences.  The FASTA definition lines should \
5444 be of the following form:\n\n\
5445 >ID [gene=symbol] [protein=name] title\n\n\
5446 where the [ and ] brackets are actually in the text.";
5447 
5448 static CharPtr GetFastaSettingName (FastaPagePtr fpp)
5449 {
5450   if (fpp == NULL)
5451   {
5452         return NULL;
5453   }
5454   else if (fpp->is_mrna)
5455   {
5456     return "PARSEMRNASEQID";    
5457   }
5458   else if (fpp->is_na)
5459   {
5460     return "PARSENUCSEQID";
5461   }
5462   else
5463   {
5464     return "PARSEPROTSEQID";
5465   }
5466 }
5467 
5468 static void ChangeIDParse (ButtoN b)
5469 {
5470   FastaPagePtr      fpp;
5471   CharPtr           setting_name;
5472 
5473   fpp = (FastaPagePtr) GetObjectExtra (b);
5474   if (fpp != NULL) {
5475     fpp->parseSeqId = GetStatus (b);
5476   
5477     setting_name = GetFastaSettingName (fpp);
5478   
5479     if (fpp->parseSeqId) {
5480       SetAppParam ("SEQUINCUSTOM", "PREFERENCES", setting_name, "TRUE");
5481       SafeHide (fpp->singleIdGrp);
5482     } else {
5483       SetAppParam ("SEQUINCUSTOM", "PREFERENCES", setting_name, "FALSE");
5484       if (fpp->single)
5485       {
5486         SafeShow (fpp->singleIdGrp);
5487       }
5488       else
5489       {
5490         SafeHide (fpp->singleIdGrp);
5491       }
5492     }
5493   }
5494 }
5495 
5496 extern DialoG CreateFastaDialog (GrouP h, CharPtr title,
5497                                  Boolean is_na, Boolean is_mrna, CharPtr text,
5498                                  Boolean single, Int2Ptr seqPackagePtr)
5499 
5500 {
5501   FastaPagePtr  fpp;
5502   GrouP         g;
5503   GrouP         m;
5504   GrouP         p;
5505   GrouP         s;
5506   PrompT        pr;
5507   CharPtr       setting_name;
5508   ButtoN        prs = NULL;
5509   Char          str [32];
5510   Boolean       parseSeqId;
5511 #ifdef WIN_MAC
5512   Int2          wid = 25;
5513 #else
5514   Int2          wid = 33;
5515 #endif
5516 
5517   p = HiddenGroup (h, 1, 0, NULL);
5518   SetGroupSpacing (p, 10, 10);
5519 
5520   fpp = (FastaPagePtr) MemNew (sizeof (FastaPage));
5521   if (fpp != NULL) {
5522 
5523     SetObjectExtra (p, fpp, CleanupFastaDialog);
5524     fpp->dialog = (DialoG) p;
5525     fpp->todialog = NULL;
5526     fpp->fromdialog = NULL;
5527     fpp->importdialog = ImportFastaDialog;
5528     if (is_na)
5529     {
5530       fpp->exportdialog = ExportNucleotideFASTADialog;
5531     }
5532     else
5533     {
5534       fpp->exportdialog = NULL;
5535     }
5536 
5537     fpp->seqPackagePtr = seqPackagePtr;
5538     if (title != NULL && title [0] != '\0') {
5539       s = NormalGroup (p, 0, -2, title, systemFont, NULL);
5540     } else {
5541       s = HiddenGroup (p, 0, -2, NULL);
5542     }
5543     m = HiddenGroup (s, -1, 0, NULL);
5544 
5545     fpp->path [0] = '\0';
5546     fpp->is_na = is_na;
5547     fpp->is_mrna = is_mrna;
5548     fpp->single = single;
5549 
5550     setting_name = GetFastaSettingName (fpp);
5551     
5552     if (GetAppParam ("SEQUINCUSTOM", "SETTINGS", "ALLOWNOSEQID", NULL, str, sizeof (str))
5553         && StringICmp (str, "TRUE") == 0)
5554     {
5555       prs = CheckBox (m, "Fasta definition line starts with sequence ID", ChangeIDParse);
5556       SetObjectExtra (prs, fpp, NULL);
5557     }
5558     parseSeqId = FALSE;
5559     if (GetAppParam ("SEQUINCUSTOM", "PREFERENCES", setting_name, NULL, str, sizeof (str))) {
5560       if (StringICmp (str, "TRUE") == 0) {
5561         parseSeqId = TRUE;
5562       }
5563     }
5564     else
5565     {
5566       parseSeqId = TRUE;
5567     }
5568     SetStatus (prs, parseSeqId);
5569     
5570     fpp->parseSeqId = parseSeqId;
5571     if (fpp->single) {
5572       fpp->singleIdGrp = HiddenGroup (m, 2, 0, NULL);
5573       StaticPrompt (fpp->singleIdGrp, "Enter unique identifier for this sequence", 0, dialogTextHeight, programFont, 'l');
5574       fpp->singleSeqID = DialogText (fpp->singleIdGrp, "", 6, NULL);
5575       if (parseSeqId) {
5576         Hide (fpp->singleIdGrp);
5577       }
5578     }
5579 
5580     g = HiddenGroup (m, 0, 0, NULL);
5581     fpp->instructions = MultiLinePrompt (g, text, 27 * stdCharWidth, programFont);
5582     fpp->have_seq_instr_grp = HiddenGroup (g, -1, 0, NULL);
5583     SetGroupSpacing (fpp->have_seq_instr_grp, 10, 10);
5584     fpp->doc = DocumentPanel (fpp->have_seq_instr_grp, stdCharWidth * wid, stdLineHeight * 12);
5585     SetDocAutoAdjust (fpp->doc, FALSE);
5586     pr = StaticPrompt (fpp->have_seq_instr_grp, "Choose Clear from the Edit menu to clear these sequences", 0, dialogTextHeight, systemFont, 'c');
5587     AlignObjects (ALIGN_CENTER, (HANDLE) fpp->doc, (HANDLE) pr, NULL);
5588     Hide (fpp->have_seq_instr_grp);
5589     AlignObjects (ALIGN_CENTER, (HANDLE) fpp->instructions,
5590                   (HANDLE) fpp->have_seq_instr_grp, NULL);
5591                   
5592     AlignObjects (ALIGN_CENTER, (HANDLE) g,
5593                                 (HANDLE) prs,
5594                                 (HANDLE) fpp->singleIdGrp,
5595                                 NULL);                  
5596   }
5597 
5598   return (DialoG) p;
5599 }
5600 
5601 typedef struct phylippage {
5602   DIALOG_MESSAGE_BLOCK
5603   Uint1        format;
5604   Char         path [PATH_MAX];
5605   SeqEntryPtr  sep;
5606   ValNodePtr   errmsgs;
5607   DoC          doc;
5608   GrouP        instructions;
5609   Char         extension [10];
5610   Int4         type;
5611   TSequenceInfoPtr aln_settings;
5612 
5613 } PhylipPage, PNTR PhylipPagePtr;
5614 
5615 
5616 #define PhylipFormatBufLen 1000
5617 
5618 static void FormatPhylipDoc (PhylipPagePtr ppp)
5619 
5620 {
5621   Nlm_QualNameAssocPtr ap;
5622   BioseqPtr          bsp;
5623   BioseqSetPtr       bssp;
5624   CharPtr            label;
5625   Int4               len;
5626   CharPtr            measure;
5627   SeqEntryPtr        nsep;
5628   Int2               num;
5629   CharPtr            plural;
5630   SeqIdPtr           sip;
5631   SeqEntryPtr        sep;
5632   CharPtr            str;
5633   CharPtr            title;
5634   CharPtr            ttl;
5635   CharPtr            tmp;
5636   CharPtr            valstr;
5637   ValNodePtr         vnp;
5638 
5639   if (ppp != NULL) {
5640     str = MemNew (sizeof (char) * PhylipFormatBufLen);
5641     tmp = MemNew (sizeof (char) * PhylipFormatBufLen);
5642     if (str == NULL || tmp == NULL) return;
5643     num = 0;
5644     len = 0;
5645     sep = ppp->sep;
5646     if (sep != NULL && IS_Bioseq_set (sep)) {
5647       bssp = (BioseqSetPtr) sep->data.ptrvalue;
5648       if (bssp != NULL && (bssp->_class == 7 ||
5649                            (IsPopPhyEtcSet (bssp->_class)))) {
5650         for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
5651           num++;
5652           if (IS_Bioseq (sep)) {
5653             bsp = (BioseqPtr) sep->data.ptrvalue;
5654             if (bsp != NULL) {
5655               len += bsp->length;
5656             }
5657           } else if (IS_Bioseq_set (sep)) {
5658             nsep = FindNucSeqEntry (sep);
5659             if (nsep != NULL && IS_Bioseq (nsep)) {
5660               bsp = (BioseqPtr) nsep->data.ptrvalue;
5661               if (bsp != NULL) {
5662                 len += bsp->length;
5663               }
5664             }
5665           }
5666         }
5667       }
5668     }
5669     if (num > 1) {
5670       plural = "s";
5671     } else {
5672       plural = "";
5673     }
5674     label = "Sequence";
5675     measure = "nucleotides";
5676     sprintf (str, "%d nucleotide sequence%s, total length %ld %s\n",
5677              (int) num, plural, (long) len, measure);
5678     AppendText (ppp->doc, str, &faParFmt, &faColFmt, programFont);
5679     vnp = ppp->errmsgs;
5680     num = 0;
5681     sep = ppp->sep;
5682     if (sep != NULL && IS_Bioseq_set (sep)) {
5683       bssp = (BioseqSetPtr) sep->data.ptrvalue;
5684       if (bssp != NULL && (bssp->_class == 7 ||
5685                            (IsPopPhyEtcSet (bssp->_class)))) {
5686         for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
5687           nsep = NULL;
5688           num++;
5689           len = 0;
5690           sip = NULL;
5691           tmp [0] = '\0';
5692           if (IS_Bioseq (sep)) {
5693             bsp = (BioseqPtr) sep->data.ptrvalue;
5694             if (bsp != NULL) {
5695               len = bsp->length;
5696               sip = SeqIdFindWorst (bsp->id);
5697               SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
5698             }
5699           } else if (IS_Bioseq_set (sep)) {
5700             nsep = FindNucSeqEntry (sep);
5701             if (nsep != NULL && IS_Bioseq (nsep)) {
5702               bsp = (BioseqPtr) nsep->data.ptrvalue;
5703               if (bsp != NULL) {
5704                 len = bsp->length;
5705                 sip = SeqIdFindWorst (bsp->id);
5706                 SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
5707               }
5708             }
5709           }
5710           sprintf (str, "\n%s %d\nLength: %ld %s\nSequence ID: %s\n", label,
5711                    (int) num, (long) len, measure, tmp);
5712           ttl = NULL;
5713           SeqEntryExplore (nsep, (Pointer) (&ttl), FindFirstTitle);
5714           title = StringSaveNoNull (ttl);
5715           if (title != NULL) {
5716             valstr = FindValueFromPairInDefline ("organism", title);
5717             if (!StringHasNoText (valstr)) {
5718               AddReportLine (str, "Organism", valstr);
5719             }
5720             valstr = MemFree (valstr);
5721             RemoveValueFromDefline ("organism", title);
5722             
5723             valstr = FindValueFromPairInDefline ("lineage", title);
5724             if (!StringHasNoText (valstr)) {
5725               AddReportLine (str, "Lineage", valstr);
5726             }
5727             valstr = MemFree (valstr);
5728             RemoveValueFromDefline ("lineage", title);
5729 
5730             for (ap = current_orgmod_subtype_alist; ap->name != NULL; ap++) {
5731               if (IsNonTextModifier (ap->name))
5732               {
5733                 if (FindValuePairInDefLine (ap->name, title, NULL) != NULL)
5734                 {
5735                   AddReportLine (str, ap->name, "TRUE");
5736                   RemoveValueFromDefline (ap->name, title);
5737                 }
5738               }
5739               else
5740               {
5741                 valstr = FindValueFromPairInDefline (ap->name, title);
5742                 if (!StringHasNoText (valstr)) {
5743                   AddReportLine (str, ap->name, title);  
5744                 }
5745                 valstr = MemFree (valstr);
5746                 RemoveValueFromDefline (ap->name, title);
5747               }
5748             }
5749             for (ap = current_subsource_subtype_alist; ap->name != NULL; ap++) {
5750               if (IsNonTextModifier (ap->name))
5751               {
5752                 if (FindValuePairInDefLine (ap->name, title, NULL) != NULL)
5753                 {
5754                   AddReportLine (str, ap->name, "TRUE");
5755                   RemoveValueFromDefline (ap->name, title);
5756                 }
5757               }
5758               else
5759               {
5760                 valstr = FindValueFromPairInDefline (ap->name, title);
5761                 if (!StringHasNoText (valstr)) {
5762                   AddReportLine (str, ap->name, title);  
5763                 }
5764                 valstr = MemFree (valstr);
5765                 RemoveValueFromDefline (ap->name, title);
5766               }
5767             }
5768             
5769             valstr = FindValueFromPairInDefline ("note-orgmod", title);
5770             if (!StringHasNoText (valstr)) {
5771               AddReportLine (str, "Note", valstr);
5772             }
5773             valstr = MemFree (valstr);
5774             RemoveValueFromDefline ("note-orgmod", title);
5775             
5776             valstr = FindValueFromPairInDefline ("note-subsrc", title);
5777             if (!StringHasNoText (valstr)) {
5778               AddReportLine (str, "Note", valstr);
5779             }
5780             valstr = MemFree (valstr);
5781             RemoveValueFromDefline ("note-subsrc", title);
5782             
5783             valstr = FindValueFromPairInDefline ("molecule", title);
5784             if (!StringHasNoText (valstr)) {
5785               AddReportLine (str, "Molecule", valstr);
5786             }
5787             valstr = MemFree (valstr);
5788             RemoveValueFromDefline ("molecule", title);
5789             
5790             valstr = FindValueFromPairInDefline ("moltype", title);
5791             if (!StringHasNoText (valstr)) {
5792               AddReportLine (str, "MolType", valstr);
5793             }
5794             valstr = MemFree (valstr);
5795             RemoveValueFromDefline ("moltype", title);
5796             
5797             valstr = FindValueFromPairInDefline ("location", title);
5798             if (!StringHasNoText (valstr)) {
5799               AddReportLine (str, "Location", valstr);
5800             }
5801             valstr = MemFree (valstr);
5802             RemoveValueFromDefline ("location", valstr);
5803 
5804             TrimSpacesAroundString (title);
5805             if (! StringHasNoText (title)) {
5806               StringCat (str, "Title: ");
5807               StringNCat (str, title, 128);
5808               StringCat (str, "\n");
5809             } else {
5810               StringCat (str, "No title detected\n");
5811             }
5812           }
5813           MemFree (title);
5814           if (vnp != NULL && vnp->data.ptrvalue != NULL) {
5815             StringCat (str, (CharPtr) vnp->data.ptrvalue);
5816             StringCat (str, "\n");
5817           }
5818           AppendText (ppp->doc, str, &faParFmt, &faColFmt, programFont);
5819           if (vnp != NULL) {
5820             vnp = vnp->next;
5821           }
5822         }
5823       }
5824     }
5825     MemFree (str);
5826     MemFree (tmp);
5827     UpdateDocument (ppp->doc, 0, 0);
5828   }
5829 }
5830 
5831 static void ResetPhylipPage (PhylipPagePtr ppp)
5832 
5833 {
5834   if (ppp != NULL) {
5835     ppp->sep = SeqEntryFree (ppp->sep);
5836     ppp->errmsgs = ValNodeFreeData (ppp->errmsgs);
5837   }
5838 }
5839 
5840 static CharPtr noOrgInTitleWarning =
5841 "sequences have organism information in titles. " \
5842 "It is critical to annotate the data file with organism and source information. " \
5843 "Please quit Sequin and read the Sequin Quick Guide section on preparing the data files before proceeding.";
5844 
5845 static void CountTitlesWithoutOrganisms (SeqEntryPtr sep)
5846 {
5847   IDAndTitleEditPtr iatep;
5848   Int4              seq_num;
5849   CharPtr           org_name;
5850   Int4              num_sequences = 0, num_with_orgs = 0;
5851   
5852   iatep = SeqEntryListToIDAndTitleEdit (sep);
5853   if (iatep == NULL)
5854   {
5855     return;
5856   }
5857   
5858   for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
5859   {
5860     if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
5861     {
5862       continue;
5863     }
5864     num_sequences ++;
5865     org_name = FindValueFromPairInDefline ("organism", iatep->title_list [seq_num]);
5866     if (!StringHasNoText (org_name))
5867     {
5868       num_with_orgs ++;
5869     }
5870     org_name = MemFree (org_name);
5871   }
5872   iatep = IDAndTitleEditFree (iatep);
5873   if (num_sequences != num_with_orgs && num_with_orgs != 0)
5874   {
5875     Message (MSG_OK, "%d of %d %s", num_sequences - num_with_orgs, (int) num_sequences, noOrgInTitleWarning);
5876   }
5877   
5878 }
5879 
5880 static CharPtr  phylipNucMsg = "\
5881 \nClick 'Import Nucleotide Alignment' to load your \
5882 nucleotide alignment file.\n\nClick on 'Custom Alignment Settings' \
5883 if Sequin has trouble reading your alignment file.";
5884 
5885 static void SetPhylipDocInstructions (PhylipPagePtr ppp)
5886 {
5887   if (ppp == NULL || ppp->doc == NULL) return;
5888   Reset (ppp->doc);
5889   AppendText (ppp->doc, phylipNucMsg, &faParFmt, &faColFmt, programFont);
5890   UpdateDocument (ppp->doc, 0, 0);
5891   Update ();
5892 }
5893 
5894 static Boolean ImportPhylipDialog (DialoG d, CharPtr filename)
5895 {
5896   Char           path [PATH_MAX];
5897   PhylipPagePtr  ppp;
5898   SeqEntryPtr    sep;
5899   RecT           r;
5900   FILE           *fp;
5901   ObjMgrDataPtr  omdptop;
5902   ObjMgrData     omdata;
5903   Uint2          parenttype;
5904   Pointer        parentptr;
5905   Char           errStr [PATH_MAX + 64];
5906   CharPtr        no_org_err_msg = NULL;
5907 
5908   if (d == NULL || filename == NULL) return FALSE;
5909 
5910   path [0] = '\0';
5911   StringNCpy_0 (path, filename, sizeof (path));
5912   ppp = (PhylipPagePtr) GetObjectExtra (d);
5913   if (ppp == NULL) {
5914     return FALSE;
5915   }
5916 
5917   if (path [0] != '\0' || GetInputFileName (path, sizeof (path), ppp->extension, "TEXT")) {
5918     WatchCursor ();
5919     StringCpy (ppp->path, path);
5920     ObjectRect (ppp->doc, &r);
5921     InsetRect (&r, 4, 4);
5922     faColFmt.pixWidth = r.right - r.left;
5923     Reset (ppp->doc);
5924     Update ();
5925     ppp->sep = SeqEntryFree (ppp->sep);
5926     fp = FileOpen (path, "r");
5927     if (fp != NULL) {
5928       ppp->sep = SeqEntryFromAlignmentFile (fp, ppp->aln_settings,
5929                                             Seq_mol_na, no_org_err_msg);
5930                                             
5931       /* check for bracketing issues here */
5932       if (CollectIDsAndTitles (ppp->sep, NULL, TRUE))
5933       {
5934         /* add default molecule type, topology, location, and genetic codes */
5935         AddDefaultModifierValues (ppp->sep);
5936       }        
5937       else
5938       {
5939         ppp->sep = SeqEntryFree (ppp->sep);
5940       }
5941                                                   
5942       sep = ppp->sep;
5943       if (sep != NULL) {
5944         SaveSeqEntryObjMgrData (ppp->sep, &omdptop, &omdata);
5945         GetSeqEntryParent (ppp->sep, &parentptr, &parenttype);
5946         SeqMgrLinkSeqEntry (sep, parenttype, parentptr);
5947         RestoreSeqEntryObjMgrData (sep, omdptop, &omdata);
5948 
5949         FormatPhylipDoc (ppp);
5950         SafeShow (ppp->doc);
5951 
5952         CountTitlesWithoutOrganisms (sep);
5953       } else {
5954         SendHelpScrollMessage (helpForm, "Nucleotide Page", "Nucleotide Page for Aligned Data Formats");
5955         SetPhylipDocInstructions (ppp);
5956       }
5957     } else {
5958       SetPhylipDocInstructions (ppp);
5959     }
5960   } else {
5961         sprintf (errStr, "ERROR: Unable to open file %s\n\n", path);
5962         AppendText (ppp->doc, errStr, &faParFmt, &faColFmt, programFont);
5963         AppendText (ppp->doc, strerror(errno), &faParFmt, &faColFmt, programFont);
5964         SafeShow (ppp->doc);
5965     Update ();
5966   }
5967   ArrowCursor ();
5968   Update ();
5969   return TRUE;
5970 }
5971 
5972 static void CleanupPhylipDialog (GraphiC g, VoidPtr data)
5973 
5974 {
5975   PhylipPagePtr  ppp;
5976 
5977   ppp = (PhylipPagePtr) data;
5978   if (ppp != NULL) {
5979     ResetPhylipPage (ppp);
5980     SequenceInfoFree (ppp->aln_settings);
5981     ppp->aln_settings = NULL;
5982   }
5983   MemFree (data);
5984 }
5985 
5986 
5987 static DialoG CreatePhylipDialog (GrouP h, CharPtr title, CharPtr text,
5988                                   Int2 format, CharPtr extension,
5989                                   Int4 type)
5990 
5991 {
5992   PhylipPagePtr  ppp;
5993   GrouP          g;
5994   GrouP          m;
5995   GrouP          p;
5996   GrouP          s;
5997   RecT          r;
5998 
5999   p = HiddenGroup (h, 1, 0, NULL);
6000   SetGroupSpacing (p, 10, 10);
6001 
6002   ppp = (PhylipPagePtr) MemNew (sizeof (PhylipPage));
6003   if (ppp != NULL) {
6004 
6005     SetObjectExtra (p, ppp, CleanupPhylipDialog);
6006     ppp->dialog = (DialoG) p;
6007     ppp->todialog = NULL;
6008     ppp->fromdialog = NULL;
6009     ppp->importdialog = ImportPhylipDialog;
6010     ppp->type = type;
6011 
6012     if (title != NULL && title [0] != '\0') {
6013       s = NormalGroup (p, 0, -2, title, systemFont, NULL);
6014     } else {
6015       s = HiddenGroup (p, 0, -2, NULL);
6016     }
6017     m = HiddenGroup (s, -1, 0, NULL);
6018 
6019     ppp->format = format;
6020     ppp->path [0] = '\0';
6021     StringNCpy_0 (ppp->extension, extension, sizeof (ppp->extension));
6022   
6023     g = HiddenGroup (m, 0, 0, NULL);
6024     ppp->doc = DocumentPanel (g, stdCharWidth * 27, stdLineHeight * 8);
6025     ObjectRect (ppp->doc, &r);
6026     InsetRect (&r, 4, 4);
6027     faColFmt.pixWidth = r.right - r.left;
6028 
6029     ppp->aln_settings = GetDefaultSequenceInfo();
6030 
6031     SetPhylipDocInstructions (ppp);
6032   }
6033 
6034   return (DialoG) p;
6035 }
6036 
6037 #define NUCLEOTIDE_PAGE   0
6038 #define ORGANISM_PAGE     1
6039 #define MRNA_PAGE         2
6040 #define PROTEIN_PAGE      3
6041 #define ANNOTATE_PAGE     4
6042 
6043 /*---------------------------------------------------------------------*/
6044 /*                                                                     */
6045 /* HasZeroLengthSequence () -- Checks to see if any of a submission's  */
6046 /*                             sequences are missing (ie -- zero       */
6047 /*                             length).                                */
6048 /*                                                                     */
6049 /*---------------------------------------------------------------------*/
6050 
6051 extern Boolean HasZeroLengthSequence (ForM newForm)
6052 {
6053   SequencesFormPtr  sqfp;
6054   FastaPagePtr      fpp;
6055   SeqEntryPtr       sep;
6056   BioseqPtr         bsp;
6057 
6058   /* Get the list of Bioseqs to check */
6059 
6060   sqfp = (SequencesFormPtr) GetObjectExtra (newForm);
6061   if (NULL == sqfp)
6062     return TRUE;
6063 
6064   fpp = GetObjectExtra (sqfp->dnaseq);
6065   sep = fpp->list;
6066 
6067   /* Check the list */
6068 
6069   while (NULL != sep) {
6070     if (sep->choice == 1) { 
6071       bsp = (BioseqPtr) sep->data.ptrvalue;
6072       if (bsp->length <= 0)
6073         return TRUE;
6074     }
6075     sep = sep->next;
6076   }
6077 
6078   /* If we made it to here, then */
6079   /* there were none found.      */
6080 
6081   return FALSE;
6082 }
6083 
6084 extern Boolean SequencesFormHasProteins (ForM f)
6085 
6086 {
6087   FastaPagePtr      fpp;
6088   SequencesFormPtr  sqfp;
6089 
6090   sqfp = (SequencesFormPtr) GetObjectExtra (f);
6091   if (sqfp != NULL) {
6092     if (PackageTypeIsSet (sqfp->seqPackage)) return TRUE;
6093     fpp = GetObjectExtra (sqfp->protseq);
6094     if (fpp != NULL) {
6095       if (fpp->path [0] != '\0') {
6096         return TRUE;
6097       }
6098     }
6099   }
6100   return FALSE;
6101 }
6102 
6103 extern SeqEntryPtr GetSequencesFormProteinList (ForM f)
6104 
6105 {
6106   FastaPagePtr      fpp;
6107   SequencesFormPtr  sqfp;
6108 
6109   sqfp = (SequencesFormPtr) GetObjectExtra (f);
6110   if (sqfp != NULL) {
6111     fpp = GetObjectExtra (sqfp->protseq);
6112     if (fpp != NULL) {
6113       return fpp->list;
6114     }
6115   }
6116   return NULL;
6117 }
6118 
6119 static SeqEntryPtr GetSeqEntryFromSequencesForm (SequencesFormPtr sqfp)
6120 {
6121   SeqEntryPtr list = NULL;
6122   FastaPagePtr       fpp;
6123   PhylipPagePtr      ppp;
6124   SeqEntryPtr        sep;
6125   BioseqSetPtr       bssp;
6126   
6127   if (sqfp == NULL) return NULL;
6128 
6129   if (sqfp->seqPackage == SEQ_PKG_SEGMENTED) 
6130   {
6131     fpp = (FastaPagePtr) GetObjectExtra (sqfp->dnaseq);
6132     if (fpp != NULL) 
6133     {
6134       list = fpp->list;
6135     }
6136   }
6137   else if (sqfp->seqFormat == SEQ_FMT_FASTA) {
6138     fpp = (FastaPagePtr) GetObjectExtra (sqfp->dnaseq);
6139     if (fpp != NULL) 
6140     {
6141       list = fpp->list;
6142     }
6143   } else if (sqfp->seqFormat == SEQ_FMT_ALIGNMENT) {
6144     ppp = (PhylipPagePtr) GetObjectExtra (sqfp->dnaseq);
6145     if (ppp != NULL) {
6146       sep = ppp->sep;
6147       if (sep != NULL && IS_Bioseq_set (sep)) {
6148         bssp = (BioseqSetPtr) sep->data.ptrvalue;
6149         if (bssp != NULL) {
6150           list = bssp->seq_set;
6151         }
6152       }
6153     }
6154   }
6155   return list;
6156 }
6157 
6158 extern SeqEntryPtr GetSequencesFormNucleotideList (ForM f)
6159 {
6160   SequencesFormPtr  sqfp;
6161 
6162   sqfp = (SequencesFormPtr) GetObjectExtra (f);
6163   if (sqfp != NULL) {
6164     return GetSeqEntryFromSequencesForm (sqfp);
6165   }
6166   return NULL;
6167 }
6168 
6169 extern Boolean SequencesFormHasTooManyNucleotides (ForM f)
6170 
6171 {
6172   FastaPagePtr      fpp;
6173   SequencesFormPtr  sqfp;
6174 
6175   sqfp = (SequencesFormPtr) GetObjectExtra (f);
6176   if (sqfp != NULL && PackageTypeIsSingle (sqfp->seqPackage))
6177   {
6178     fpp = GetObjectExtra (sqfp->dnaseq);
6179     if (fpp != NULL) {
6180       if (fpp->list != NULL && fpp->list->next != NULL) {
6181         return TRUE;
6182       }
6183     }
6184   }
6185   return FALSE;
6186 }
6187 
6188 extern DialoG CreateTagListDialogEx (GrouP h, Uint2 rows, Uint2 cols,
6189                                      Int2 spacing, Uint2Ptr types,
6190                                      Uint2Ptr textWidths, EnumFieldAssocPtr PNTR alists,
6191                                      Boolean useBar, Boolean noExtend,
6192                                      ToDialogFunc tofunc, FromDialogFunc fromfunc);
6193 
6194 static ValNodePtr 
6195 BuildModifierTypeList 
6196 (ValNodePtr type_list,
6197  CharPtr    new_title, 
6198  Boolean    allow_prot)
6199 {
6200   ValNodePtr      modifier_info_list;
6201   ValNodePtr      info_vnp, type_vnp;
6202   ModifierInfoPtr mip;
6203   
6204   modifier_info_list = ParseAllBracketedModifiers (new_title);
6205   for (info_vnp = modifier_info_list; info_vnp != NULL; info_vnp = info_vnp->next)
6206   {
6207     mip = (ModifierInfoPtr)info_vnp->data.ptrvalue;
6208     if (mip == NULL 
6209         || mip->modtype == eModifierType_Protein
6210         || mip->modtype == eModifierType_Organism)
6211     {
6212       continue;
6213     }
6214     if (mip->modtype == eModifierType_SourceQual)
6215     {
6216           for (type_vnp = type_list;
6217                type_vnp != NULL 
6218                  && (type_vnp->choice != mip->subtype 
6219                      || StringICmp (type_vnp->data.ptrvalue, mip->name) != 0); 
6220                type_vnp = type_vnp->next)
6221           {
6222           }
6223     }
6224     else
6225     {
6226           for (type_vnp = type_list;
6227                type_vnp != NULL && StringICmp (type_vnp->data.ptrvalue, mip->name) != 0;
6228                type_vnp = type_vnp->next)
6229           {
6230           }
6231     }
6232         if (type_vnp == NULL)
6233         {
6234           type_vnp = ValNodeNew (type_list);
6235           if (type_list == NULL) type_list = type_vnp;
6236           if (type_vnp != NULL)
6237           {
6238                 type_vnp->choice = mip->subtype;
6239                 type_vnp->data.ptrvalue = StringSave (mip->name);
6240           }
6241         }
6242   }
6243   ModifierInfoListFree (modifier_info_list);
6244   return type_list;
6245 }
6246 
6247 
6248 static Uint2 modedit_widths [] = {
6249   0, 0,
6250 };
6251 
6252 ENUM_ALIST(nontextmodedit_alist)
6253   {"FALSE",             0},
6254   {"TRUE",              1},
6255 END_ENUM_ALIST
6256 
6257 extern void ConfirmSequencesFormParsing (ForM f, FormActnFunc putItAllTogether)
6258 
6259 {
6260   SequencesFormPtr  sqfp;
6261 
6262   sqfp = (SequencesFormPtr) GetObjectExtra (f);
6263   if (sqfp != NULL && putItAllTogether != NULL) {
6264     putItAllTogether (sqfp->form);
6265   }
6266 }
6267 
6268 extern void AddToSubSource (BioSourcePtr biop, CharPtr title, CharPtr label, Uint1 subtype)
6269 
6270 {
6271   CharPtr       ptr;
6272   SubSourcePtr  ssp;
6273   CharPtr       str;
6274   SubSourcePtr  tmpssp;
6275 
6276   if (biop == NULL || title == NULL || label == NULL) return;
6277   str = MemNew (StringLen (title));
6278   if (str == NULL) return;
6279   ptr = StringISearch (title, label);
6280   if (ptr != NULL) {
6281     StringCpy (str, ptr + StringLen (label));
6282     ptr = StringChr (str, ']');
6283     if (ptr != NULL) {
6284       *ptr = '\0';
6285       TrimSpacesAroundString (str);
6286       ssp = SubSourceNew ();
6287       if (biop->subtype == NULL) {
6288         biop->subtype = ssp;
6289       } else {
6290         tmpssp = biop->subtype;
6291         while (tmpssp->next != NULL) {
6292           tmpssp = tmpssp->next;
6293         }
6294         tmpssp->next = ssp;
6295       }
6296       if (ssp != NULL) {
6297         ssp->subtype = subtype;
6298         ssp->name = StringSave (str);
6299       }
6300     }
6301   }
6302   MemFree (str);
6303 }
6304 
6305 extern void AddToOrgMod (BioSourcePtr biop, CharPtr title, CharPtr label, Uint1 subtype)
6306 
6307 {
6308   OrgModPtr   mod;
6309   OrgNamePtr  onp;
6310   OrgRefPtr   orp;
6311   CharPtr     ptr;
6312   CharPtr     str;
6313   OrgModPtr   tmpmod;
6314 
6315   if (biop == NULL || title == NULL || label == NULL) return;
6316   str = MemNew (StringLen (title));
6317   if (str == NULL) return;
6318   ptr = StringISearch (title, label);
6319   if (ptr != NULL) {
6320     StringCpy (str, ptr + StringLen (label));
6321     ptr = StringChr (str, ']');
6322     if (ptr != NULL) {
6323       *ptr = '\0';
6324       TrimSpacesAroundString (str);
6325       orp = biop->org;
6326       if (orp == NULL) {
6327         orp = OrgRefNew ();
6328         biop->org = orp;
6329       }
6330       if (orp != NULL) {
6331         onp = orp->orgname;
6332         if (onp == NULL) {
6333           onp = OrgNameNew ();
6334           orp->orgname = onp;
6335         }
6336         if (onp != NULL) {
6337           mod = OrgModNew ();
6338           if (onp->mod == NULL) {
6339             onp->mod = mod;
6340           } else {
6341             tmpmod = onp->mod;
6342             while (tmpmod->next != NULL) {
6343               tmpmod = tmpmod->next;
6344             }
6345             tmpmod->next = mod;
6346           }
6347           if (mod != NULL) {
6348             mod->subtype = subtype;
6349             mod->subname = StringSave (str);
6350           }
6351         }
6352       }
6353     }
6354   }
6355   MemFree (str);
6356 }
6357 
6358 #define PROC_NUC_STR_SIZE 4096
6359 
6360 static Int4 TopologyFromString (CharPtr str)
6361 {
6362   EnumFieldAssocPtr  eap;
6363 
6364   for (eap = topology_nuc_alist; eap != NULL && eap->name != NULL; eap++)
6365   {
6366     if (StringICmp (eap->name, str) == 0)
6367     {
6368       return eap->value;
6369     }
6370   }
6371   return 1; 
6372 }
6373 
6374 static BioSourcePtr AddOrgRef (BioSourcePtr biop)
6375 {
6376   if (biop == NULL)
6377   {
6378     biop = BioSourceNew ();
6379   }
6380   if (biop == NULL)
6381   {
6382     return NULL;
6383   }
6384   if (biop->org == NULL)
6385   {
6386     biop->org = OrgRefNew ();
6387   }
6388   if (biop->org == NULL)
6389   {
6390     biop = BioSourceFree (biop);
6391     return NULL;
6392   }
6393   return biop;
6394 }
6395 
6396 static BioSourcePtr AddOrgName (BioSourcePtr biop)
6397 {
6398   biop = AddOrgRef (biop);
6399   if (biop == NULL || biop->org == NULL)
6400   {
6401     biop = BioSourceFree (biop);
6402     return NULL; 
6403   }
6404   if (biop->org->orgname == NULL)
6405   {
6406     biop->org->orgname = OrgNameNew ();
6407     if (biop->org->orgname == NULL)
6408     {
6409       biop = BioSourceFree (biop);
6410       return NULL;
6411     }
6412   }
6413   return biop;
6414 }
6415 
6416 static BioSourcePtr SetGeneticCodeForBioSource (BioSourcePtr biop, Int4 gcode, Boolean is_nuc)
6417 {
6418   OrgRefPtr  orp;
6419   OrgNamePtr onp;
6420 
6421   if (gcode < 0)
6422   {
6423     return biop;
6424   }
6425 
6426   biop = AddOrgName (biop);
6427   if (biop == NULL)
6428   {
6429     return biop;
6430   }
6431   
6432   orp = biop->org;
6433   if (biop->org == NULL)
6434   {
6435     biop->org = OrgRefNew ();
6436     orp = biop->org;
6437   }
6438   if (orp != NULL) {
6439     onp = orp->orgname;
6440     if (onp == NULL) {
6441       onp = OrgNameNew ();
6442       orp->orgname = onp;
6443     }
6444     if (onp != NULL) {
6445       if (is_nuc)
6446       {
6447         onp->gcode = gcode;
6448       }
6449       else
6450       {
6451         onp->mgcode = gcode;
6452       }
6453     }
6454   }
6455   return biop;
6456 }
6457 
6458 static BioSourcePtr
6459 SetGeneticCodeFromTitle 
6460 (BioSourcePtr biop,
6461  CharPtr      title, 
6462  CharPtr      mod_name, 
6463  Boolean      is_nuc)
6464 {
6465   CharPtr    gcode_str;
6466   Int4       gcode;
6467   CharPtr    next_org_loc;
6468   
6469   if (StringHasNoText (title))
6470   {
6471     return biop;
6472   }
6473   
6474   next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6475   gcode_str = FindValueFromPairInDeflineBeforeCharPtr (mod_name, title, next_org_loc);
6476   if (!StringHasNoText (gcode_str))
6477   {
6478     gcode = GeneticCodeFromString (gcode_str);  
6479     biop = SetGeneticCodeForBioSource (biop, gcode, is_nuc);     
6480   }
6481   if (gcode_str != NULL)
6482   {
6483     RemoveValueFromDefline (mod_name, title);
6484   }
6485   gcode_str = MemFree (gcode_str);
6486   return biop;
6487 }
6488 
6489 static BioSourcePtr 
6490 SetAllGeneticCodesFromTitle 
6491 (BioSourcePtr biop,
6492  CharPtr      title)
6493 {
6494   Int4    code_to_use;
6495   CharPtr location;
6496   CharPtr next_org_loc;
6497   
6498   if (StringHasNoText (title))
6499   {
6500     return biop;
6501   }
6502   
6503   next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6504   location = FindValueFromPairInDeflineBeforeCharPtr ("location", title, next_org_loc);
6505   if (!StringHasNoText (location)) 
6506   {      
6507     code_to_use = UseGeneticCodeForLocation (location);
6508     if (code_to_use == USE_OTHER_GENETIC_CODE)
6509     {
6510       biop = SetGeneticCodeForBioSource (biop, 11, TRUE); 
6511       RemoveValueFromDefline ("genetic_code", title);    
6512     }
6513     else if (code_to_use == USE_NUCLEAR_GENETIC_CODE)
6514     {
6515       biop = SetGeneticCodeFromTitle (biop, title, "genetic_code", TRUE);
6516     }
6517     else if (code_to_use == USE_MITOCHONDRIAL_GENETIC_CODE)
6518     {
6519       biop = SetGeneticCodeFromTitle (biop, title, "genetic_code", FALSE);
6520     }
6521   }
6522   location = MemFree (location);
6523   
6524   biop = SetGeneticCodeFromTitle (biop, title, "gcode", TRUE);
6525   biop = SetGeneticCodeFromTitle (biop, title, "mgcode", FALSE);
6526 
6527   return biop;
6528 }
6529 
6530 static void 
6531 SetMoleculeAndMolTypeFromTitle 
6532 (BioseqPtr   bsp, 
6533  CharPtr     title,
6534  Int2        seqPackage)
6535 {
6536   SeqEntryPtr sep;
6537   ValNodePtr vnp;
6538   MolInfoPtr mip = NULL;
6539   Uint1      biomol;
6540   Int4       molecule;
6541   CharPtr    valstr;
6542   CharPtr    ptr;
6543   SeqLocPtr  slp;
6544   BioseqPtr  bsp_seg;
6545   
6546   if (bsp == NULL)
6547   {
6548     return;
6549   }
6550   
6551   sep = SeqMgrGetSeqEntryForData (bsp); 
6552   if (sep == NULL)
6553   {
6554     return;
6555   }
6556     
6557   vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
6558   if (vnp == NULL)
6559   {
6560     if (seqPackage == SEQ_PKG_SINGLE)
6561     {
6562       biomol = 3;
6563       molecule = Seq_mol_rna;
6564     }
6565     else 
6566     {
6567       biomol = 1;
6568       molecule = Seq_mol_dna;
6569     }
6570   }
6571   else
6572   {
6573     mip = (MolInfoPtr) vnp->data.ptrvalue;
6574     biomol = mip->biomol;
6575     molecule = bsp->mol;
6576   }
6577   
6578   /* get moltype from defline */
6579   valstr = FindValueFromPairInDefline ("moltype", title);
6580   if (!StringHasNoText (valstr))
6581   {
6582     biomol = MolTypeFromString (valstr);
6583     if (biomol == 1)
6584     {
6585       molecule = Seq_mol_na;
6586     }
6587     else if (biomol >= 2 && biomol <= 7)
6588     {
6589       molecule = Seq_mol_rna;
6590     }
6591     else if (biomol == 9)
6592     {
6593       molecule = Seq_mol_dna;
6594     }
6595     else if (biomol == 253)
6596     {
6597       molecule = Seq_mol_dna;
6598       biomol = 1;
6599     }
6600     else if (biomol == 254)
6601     {
6602       molecule = Seq_mol_rna;
6603       biomol = 1;
6604     }
6605     else if (biomol == 255)
6606     {
6607       molecule = Seq_mol_other;
6608     }
6609   }
6610   valstr = MemFree (valstr);
6611   
6612   RemoveValueFromDefline ("moltype", title);
6613 
6614   /* get molecule from defline */ 
6615   valstr = FindValueFromPairInDefline ("molecule", title);
6616   if (!StringHasNoText (valstr))
6617   {
6618     if (StringICmp (valstr, "dna") == 0) {
6619       molecule = Seq_mol_dna;
6620     } else if (StringICmp (valstr, "rna") == 0) {
6621       molecule = Seq_mol_rna;
6622     }
6623   }
6624   valstr = MemFree (valstr);
6625   RemoveValueFromDefline ("molecule", title);
6626   
6627   ptr = StringISearch (title, "[dna]");
6628   if (ptr != NULL)
6629   {
6630     molecule = Seq_mol_dna;
6631     ExciseString (title, "[dna", "]");
6632   }
6633   
6634   ptr = StringISearch (title, "[rna]");
6635   if (ptr != NULL)
6636   {
6637     molecule = Seq_mol_rna;
6638     ExciseString (title, "[rna", "]");
6639   }
6640   
6641   if (mip == NULL)
6642   {
6643     vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
6644     mip = MolInfoNew ();
6645     vnp->data.ptrvalue = mip;
6646   }
6647      
6648   mip->biomol = biomol;
6649   bsp->mol = molecule;  
6650 
6651   valstr = FindValueFromPairInDefline ("tech", title);
6652   if (!StringHasNoText (valstr))
6653   {
6654     ReadTechFromString (valstr, mip);
6655   }
6656   valstr = MemFree (valstr);
6657   RemoveValueFromDefline ("tech", title);
6658   
6659   if (bsp->repr == Seq_repr_seg)
6660   {
6661     slp = (SeqLocPtr) bsp->seq_ext;
6662     while (slp != NULL)
6663     {
6664       bsp_seg = BioseqFind (SeqLocId (slp));
6665       sep = SeqMgrGetSeqEntryForData (bsp_seg);
6666       if (bsp_seg != NULL)
6667       {
6668         bsp_seg->mol = bsp->mol;
6669       }
6670       vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
6671       if (vnp == NULL)
6672       {
6673         vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
6674       }
6675       if (vnp != NULL)
6676       {
6677         vnp->data.ptrvalue = MolInfoFree (vnp->data.ptrvalue);
6678         vnp->data.ptrvalue = (MolInfoPtr) AsnIoMemCopy (mip, (AsnReadFunc) MolInfoAsnRead,
6679                                                             (AsnWriteFunc) MolInfoAsnWrite);
6680       }
6681       slp = slp->next;
6682     }
6683   }
6684 }
6685 
6686 static void AddGeneticCodeComment (BioseqPtr bsp, CharPtr comment)
6687 {
6688   SeqDescPtr         sdp;
6689   UserObjectPtr      uop = NULL;
6690   ObjectIdPtr        oip;
6691   UserFieldPtr       ufp, last_ufp = NULL;
6692   CharPtr            comment_fmt = "Submitter genetic code: %s";
6693   CharPtr            new_comment;
6694   Int4               new_comment_len;
6695 
6696   if (bsp == NULL || StringHasNoText (comment))
6697   {
6698     return;
6699   }
6700   
6701   sdp = bsp->descr;
6702   while (sdp != NULL && uop == NULL)
6703   {
6704     if (sdp->choice == Seq_descr_user && sdp->data.ptrvalue != NULL)
6705     {
6706       uop = (UserObjectPtr) sdp->data.ptrvalue;
6707       oip = uop->type;
6708       if (oip == NULL || StringCmp (oip->str, "Submission") != 0)
6709       {
6710         uop = NULL;
6711       }
6712     }
6713     sdp = sdp->next;
6714   }
6715   
6716   
6717   if (uop == NULL)
6718   {
6719     uop = UserObjectNew ();
6720     if (uop == NULL)
6721     {
6722       return;
6723     }
6724     uop->type = ObjectIdNew ();
6725     uop->type->str = StringSave ("Submission");
6726     ValNodeAddPointer (&bsp->descr, Seq_descr_user, uop);  
6727   }
6728   
6729   ufp = uop->data;
6730   while (ufp != NULL 
6731          && (ufp->label == NULL 
6732            || StringCmp (ufp->label->str, "AdditionalComment") != 0))
6733   {
6734     last_ufp = ufp;
6735     ufp = ufp->next;
6736   }
6737   
6738   if (ufp == NULL)
6739   {
6740     ufp = UserFieldNew ();
6741     ufp->label = ObjectIdNew ();
6742     ufp->label->str = StringSave ("AdditionalComment");
6743     if (last_ufp == NULL)
6744     {
6745       uop->data = ufp;
6746     }
6747     else
6748     {
6749       last_ufp->next = ufp;
6750     }
6751   }
6752   
6753   new_comment_len = StringLen (comment) + StringLen (comment_fmt);
6754   if (!StringHasNoText (ufp->data.ptrvalue))
6755   {
6756     new_comment_len += StringLen (ufp->data.ptrvalue);
6757   }
6758   new_comment = (CharPtr) MemNew (new_comment_len * sizeof (Char));
6759   sprintf (new_comment, comment_fmt, comment);
6760   
6761   if (!StringHasNoText (ufp->data.ptrvalue))
6762   {
6763     StringCat (new_comment, ufp->data.ptrvalue);
6764   }
6765   
6766   ufp->data.ptrvalue = MemFree (ufp->data.ptrvalue);
6767   ufp->data.ptrvalue = new_comment;
6768 }
6769 
6770 static BioSourcePtr AddOrgModValue (BioSourcePtr biop, Uint1 subtype, CharPtr subname)
6771 {
6772   OrgModPtr    mod;
6773   
6774   if (subname == NULL)
6775   {
6776     return biop;
6777   }
6778 
6779   biop = AddOrgName (biop);
6780   if (biop != NULL)
6781   {
6782     mod = OrgModNew ();
6783     if (mod != NULL)
6784     {
6785       mod->subtype = subtype;
6786       mod->subname = subname;
6787       subname = NULL;
6788       mod->next = biop->org->orgname->mod;
6789       biop->org->orgname->mod = mod;
6790     }
6791   }
6792   subname = MemFree (subname);
6793   return biop;
6794 }
6795 
6796 static BioSourcePtr AddSubSourceValue (BioSourcePtr biop, Uint1 subtype, CharPtr subname)
6797 {
6798   SubSourcePtr ssp;
6799   
6800   if (subname == NULL)
6801   {
6802     return biop;
6803   }
6804 
6805   if (biop == NULL)
6806   {
6807     biop = BioSourceNew ();
6808   }
6809   if (biop != NULL)
6810   {
6811     ssp = SubSourceNew ();
6812     if (ssp != NULL)
6813     {
6814       ssp->subtype = subtype;
6815       ssp->name = subname;
6816       subname = NULL;
6817       ssp->next = biop->subtype;
6818       biop->subtype = ssp;
6819     }
6820   }
6821   subname = MemFree (subname);
6822   return biop;
6823 }
6824 
6825 extern BioSourcePtr 
6826 ExtractFromTitleToBioSourceOrgMod 
6827 (CharPtr      title,
6828  BioSourcePtr biop, 
6829  CharPtr      mod_name,
6830  Int4         subtype)
6831 {
6832   CharPtr valstr;
6833   CharPtr next_org_loc;
6834   
6835   next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6836   while ((valstr = FindValueFromPairInDeflineBeforeCharPtr (mod_name, title, next_org_loc)) != NULL)
6837   {
6838     biop = AddOrgModValue (biop, subtype, valstr);
6839     RemoveValueFromDefline (mod_name, title);
6840     next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6841   }  
6842   return biop;
6843 }
6844 
6845 extern BioSourcePtr 
6846 ExtractFromTitleToBioSourceSubSource 
6847 (CharPtr      title,
6848  BioSourcePtr biop, 
6849  CharPtr      mod_name,
6850  Int4         subtype)
6851 {
6852   CharPtr valstr;
6853   CharPtr next_org_loc;
6854   
6855   next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6856   while ((valstr = FindValueFromPairInDeflineBeforeCharPtr (mod_name, title, next_org_loc)) != NULL)
6857   {
6858     if (IsNonTextModifier (mod_name)) {
6859       if (StringICmp (valstr, "FALSE") == 0) {
6860         valstr = MemFree (valstr);
6861       } else if (StringICmp (valstr, "TRUE") == 0) {
6862         biop = AddSubSourceValue (biop, subtype, StringSave (""));
6863         valstr = MemFree (valstr);
6864       } else {
6865         biop = AddSubSourceValue (biop, subtype, valstr);
6866       }
6867     } else {
6868       biop = AddSubSourceValue (biop, subtype, valstr);
6869     }
6870     RemoveValueFromDefline (mod_name, title);
6871     next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6872   }  
6873   return biop;
6874 }
6875 
6876 /* this function collects all of the common names prior to the next organism name
6877  * and assembles a semicolon-delimited list.
6878  */
6879 extern BioSourcePtr 
6880 ExtractFromTitleToBioSourceCommonName 
6881 (CharPtr      title,
6882  BioSourcePtr biop)
6883 {
6884   CharPtr valstr, new_val;
6885   Int4    new_len;
6886   CharPtr next_org_loc;
6887   
6888   next_org_loc = FindValuePairInDefLine ("organism", title, NULL);  
6889   while ((valstr = FindValueFromPairInDeflineBeforeCharPtr ("common name", title, next_org_loc)) != NULL)
6890   {
6891     if (!StringHasNoText (valstr))
6892     {
6893       biop = AddOrgRef (biop);
6894       if (StringHasNoText (biop->org->common))
6895       {
6896         biop->org->common = MemFree (biop->org->common);
6897         biop->org->common = valstr;
6898         valstr = NULL;
6899       }
6900       else
6901       {
6902         new_len = StringLen (biop->org->common) + StringLen (valstr) + 3;
6903         new_val = (CharPtr) MemNew (new_len * sizeof (Char));
6904         if (new_val != NULL)
6905         {
6906           sprintf (new_val, "%s; %s", biop->org->common, valstr);
6907           biop->org->common = MemFree (biop->org->common);
6908           biop->org->common = new_val;
6909         }
6910       }
6911     }
6912     valstr = MemFree (valstr);
6913     RemoveValueFromDefline ("common name", title);
6914     next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6915   }  
6916   return biop;
6917 }
6918 
6919 /* When the user specifies multiple organisms on the definition line, modifiers after the
6920  * second organism go with the second organism, after the third organism go with the third
6921  * organism, etc.
6922  */
6923 extern BioSourcePtr ExtractFromDeflineToBioSource (CharPtr defline, BioSourcePtr biop)
6924 {
6925   CharPtr      taxname = NULL;
6926   OrgInfoPtr   oip = NULL;
6927   CharPtr      valstr;
6928   Nlm_EnumFieldAssocPtr ap;
6929   Nlm_QualNameAssocPtr  qp;
6930   CharPtr            next_org_loc;
6931   
6932   if (StringHasNoText (defline))
6933   {
6934     return NULL;
6935   }
6936   
6937   taxname = FindValueFromPairInDefline ("organism", defline);
6938   RemoveValueFromDefline ("organism", defline);
6939   if (StringHasNoText (taxname))
6940   {
6941     taxname = MemFree (taxname);
6942     return NULL;
6943   }
6944   else
6945   {
6946     biop = AddOrgRef (biop);
6947     if (biop == NULL)
6948     {
6949       return biop;
6950     }
6951     LoadOrganismList ();
6952     oip = FindByTaxName (taxname);
6953     SetTaxNameAndRemoveTaxRef (biop->org, taxname);
6954   }
6955   
6956   /* add division */
6957   if (oip != NULL && !StringHasNoText (oip->div))
6958   {
6959     biop = AddOrgName (biop);
6960     if (biop == NULL)
6961     {
6962       return biop;
6963     }    
6964     biop->org->orgname->div = StringSave (oip->div);
6965   }
6966   
6967   /* add common name (s) - if there are multiple entries, separate with semicolon */
6968   biop = ExtractFromTitleToBioSourceCommonName (defline, biop);
6969   /* if common name was not supplied in defline, use common name from organism list */
6970   if (biop->org == NULL || StringHasNoText (biop->org->common))
6971   {
6972     if (oip != NULL && !StringHasNoText (oip->common))
6973     {
6974       biop = AddOrgRef (biop);
6975       if (biop == NULL)
6976       {
6977         return biop;
6978       }
6979       biop->org->common = StringSave (oip->common);
6980     }
6981   }
6982   
6983   /* add lineage */
6984   if (oip != NULL && !StringHasNoText (oip->lineage))
6985   {
6986     biop = AddOrgName (biop);
6987     if (biop == NULL)
6988     {
6989       return biop;
6990     }
6991     biop->org->orgname->lineage = StringSave (oip->lineage);
6992   }
6993   
6994   /* add origin */
6995   next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
6996   valstr = FindValueFromPairInDeflineBeforeCharPtr ("origin", defline, next_org_loc);
6997   if (!StringHasNoText (valstr))
6998   {
6999     for (ap = biosource_origin_alist; ap->name != NULL; ap++) {
7000       if (StringICmp (valstr, ap->name) == 0) {
7001         if (biop == NULL)
7002         {
7003           biop = BioSourceNew ();
7004         }
7005         if (biop == NULL)
7006         {
7007           return biop;
7008         }
7009         biop->origin = (Uint1) ap->value;
7010       }
7011     }
7012   }
7013   if (valstr != NULL)
7014   {
7015     RemoveValueFromDefline ("origin", defline);
7016     next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7017   }
7018   valstr = MemFree (valstr);
7019   
7020   valstr = FindValueFromPairInDeflineBeforeCharPtr ("lineage", defline, next_org_loc);
7021   if (!StringHasNoText (valstr))
7022   {
7023     biop = AddOrgName (biop);
7024   }
7025   if (!StringHasNoText (valstr) && StringCmp (valstr, biop->org->orgname->lineage) != 0)
7026   {
7027     biop = AddOrgModValue (biop, ORGMOD_old_lineage, valstr);
7028     valstr = NULL;
7029   }
7030   if (valstr != NULL)
7031   {
7032     RemoveValueFromDefline ("lineage", defline);
7033   }
7034   valstr = MemFree (valstr);
7035   
7036   biop = SetAllGeneticCodesFromTitle (biop, defline);
7037   next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7038         
7039   for (qp = current_orgmod_subtype_alist; qp->name != NULL; qp++) {
7040     biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, qp->name, qp->value);
7041   }
7042   for (qp = current_subsource_subtype_alist; qp->name != NULL; qp++) {
7043     biop = ExtractFromTitleToBioSourceSubSource (defline, biop, qp->name, qp->value);
7044   }
7045   
7046   biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note-orgmod", 255);
7047   biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "note-subsrc", 255);
7048   biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note", 255);
7049   biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "comment", 255);
7050   biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "subsource", 255);
7051 
7052 
7053   next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7054 
7055   /* set location */
7056   valstr = FindValueFromPairInDeflineBeforeCharPtr ("location", defline, next_org_loc);
7057   if (StringHasNoText (valstr))
7058   {
7059     if (biop == NULL)
7060     {
7061       biop = BioSourceNew ();
7062     }
7063     if (biop == NULL)
7064     {
7065       return biop;
7066     }
7067     biop->genome = 1;
7068   }
7069   else if (StringICmp (valstr, "Mitochondrial") == 0)
7070   {
7071     if (biop == NULL)
7072     {
7073       biop = BioSourceNew ();
7074     }
7075     if (biop == NULL)
7076     {
7077       return biop;
7078     }
7079     biop->genome = 5;
7080   }
7081   else
7082   {
7083     for (ap = biosource_genome_simple_alist; ap->name != NULL; ap++) {
7084       if (StringICmp (valstr, ap->name) == 0) {
7085         if (biop == NULL)
7086         {
7087           biop = BioSourceNew ();
7088         }
7089         if (biop == NULL)
7090         {
7091           return biop;
7092         }
7093         biop->genome = (Uint1) ap->value;
7094       }
7095     }
7096   }
7097   if (valstr != NULL)
7098   {
7099     RemoveValueFromDefline ("location", defline);
7100   }
7101   valstr = MemFree (valstr);
7102  
7103   TrimSpacesAroundString (defline);
7104   
7105   return biop;
7106   
7107 }
7108 
7109 extern Boolean ProcessOneNucleotideTitle (Int2 seqPackage,
7110                                           SeqEntryPtr nsep, SeqEntryPtr top);
7111                                           
7112 
7113 static void ParseDeflineToBiop(CharPtr defline, BioSourcePtr biop) 
7114 {
7115   CharPtr      taxname = NULL;
7116   OrgInfoPtr   oip = NULL;
7117   CharPtr      valstr;
7118   EnumFieldAssocPtr  ap;
7119   Nlm_QualNameAssocPtr qp;
7120   CharPtr            next_org_loc;
7121   
7122   if (StringHasNoText (defline) || biop == NULL)
7123   {
7124     return;
7125   }
7126   
7127   taxname = FindValueFromPairInDefline ("organism", defline);
7128   RemoveValueFromDefline ("organism", defline);
7129   if (StringHasNoText (taxname))
7130   {
7131     taxname = MemFree (taxname);
7132   }
7133   else
7134   {
7135     biop = AddOrgRef (biop);
7136     biop->org->taxname = taxname;
7137     LoadOrganismList ();
7138     oip = FindByTaxName (taxname);
7139   }
7140   
7141   /* add division */
7142   if (oip != NULL && !StringHasNoText (oip->div))
7143   {
7144     biop = AddOrgName (biop);
7145     biop->org->orgname->div = StringSave (oip->div);
7146   }
7147   
7148   /* add common name (s) - if there are multiple entries, separate with semicolon */
7149   biop = ExtractFromTitleToBioSourceCommonName (defline, biop);
7150   /* if common name was not supplied in defline, use common name from organism list */
7151   if (biop->org == NULL || StringHasNoText (biop->org->common))
7152   {
7153     if (oip != NULL && !StringHasNoText (oip->common))
7154     {
7155       biop = AddOrgRef (biop);
7156       biop->org->common = StringSave (oip->common);
7157     }
7158   }
7159   
7160   /* add lineage */
7161   if (oip != NULL && !StringHasNoText (oip->lineage))
7162   {
7163     biop = AddOrgName (biop);
7164     biop->org->orgname->lineage = StringSave (oip->lineage);
7165   }
7166   
7167   /* add origin */
7168   next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7169   valstr = FindValueFromPairInDeflineBeforeCharPtr ("origin", defline, next_org_loc);
7170   if (!StringHasNoText (valstr))
7171   {
7172     for (ap = biosource_origin_alist; ap->name != NULL; ap++) {
7173       if (StringICmp (valstr, ap->name) == 0) {
7174         biop->origin = (Uint1) ap->value;
7175       }
7176     }
7177   }
7178   if (valstr != NULL)
7179   {
7180     RemoveValueFromDefline ("origin", defline);
7181     next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7182   }
7183   valstr = MemFree (valstr);
7184   
7185   valstr = FindValueFromPairInDeflineBeforeCharPtr ("lineage", defline, next_org_loc);
7186   if (!StringHasNoText (valstr))
7187   {
7188     biop = AddOrgName (biop);
7189   }
7190   if (!StringHasNoText (valstr) && StringCmp (valstr, biop->org->orgname->lineage) != 0)
7191   {
7192     biop = AddOrgModValue (biop, ORGMOD_old_lineage, valstr);
7193     valstr = NULL;
7194   }
7195   if (valstr != NULL)
7196   {
7197     RemoveValueFromDefline ("lineage", defline);
7198   }
7199   valstr = MemFree (valstr);
7200   
7201   biop = SetAllGeneticCodesFromTitle (biop, defline);
7202   next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7203         
7204   for (qp = current_orgmod_subtype_alist; qp->name != NULL; qp++) {
7205     biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, qp->name, qp->value);
7206   }
7207   for (qp = current_subsource_subtype_alist; qp->name != NULL; qp++) {
7208     biop = ExtractFromTitleToBioSourceSubSource (defline, biop, qp->name, qp->value);
7209   }
7210   
7211   biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note-orgmod", 255);
7212   biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "note-subsrc", 255);
7213   biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note", 255);
7214   biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "comment", 255);
7215   biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "subsource", 255);
7216 
7217 
7218   next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7219 
7220   /* set location */
7221   valstr = FindValueFromPairInDeflineBeforeCharPtr ("location", defline, next_org_loc);
7222   if (StringHasNoText (valstr))
7223   {
7224     /* don't set defaults */
7225   }
7226   else if (StringICmp (valstr, "Mitochondrial") == 0)
7227   {
7228     biop->genome = 5;
7229   }
7230   else
7231   {
7232     for (ap = biosource_genome_simple_alist; ap->name != NULL; ap++) {
7233       if (StringICmp (valstr, ap->name) == 0) {
7234         biop->genome = (Uint1) ap->value;
7235       }
7236     }
7237   }
7238   if (valstr != NULL)
7239   {
7240     RemoveValueFromDefline ("location", defline);
7241   }
7242   valstr = MemFree (valstr);
7243  
7244   TrimSpacesAroundString (defline);
7245 }
7246 
7247 
7248 static void ParseModifiersFromDeflineCallback (BioseqPtr bsp, Pointer userdata)
7249 {
7250   CharPtr           title;
7251   SeqDescrPtr       sdp, sdp_biop, prev_sdp = NULL;
7252   BioSourcePtr      biop = NULL;
7253   CharPtr           valstr;
7254   SeqMgrDescContext context;
7255   
7256   if (bsp == NULL) return;
7257   
7258   if (ISA_aa(bsp->mol)) {
7259     return;
7260   }
7261   
7262   sdp = bsp->descr;
7263   while (sdp != NULL && sdp->choice != Seq_descr_title) {
7264     prev_sdp = sdp;
7265     sdp = sdp->next;
7266   }
7267   if (sdp == NULL || sdp->data.ptrvalue == NULL) {
7268     return;
7269   }
7270   
7271   title = sdp->data.ptrvalue;
7272   
7273   if (StringChr(title, '[') == NULL || StringChr(title, ']') == NULL) {
7274     return;
7275   }
7276   
7277   /* parse moltype values */
7278   SetMoleculeAndMolTypeFromTitle (bsp, title, SEQ_PKG_GENBANK);
7279   
7280   /* get topology from defline */
7281   valstr = FindValueFromPairInDefline ("topology", title);
7282   if (valstr != NULL)
7283   {
7284     if (!StringHasNoText (valstr))
7285     {
7286       bsp->topology = TopologyFromString (valstr);
7287     }
7288     RemoveValueFromDefline ("topology", title);
7289     valstr = MemFree (valstr);
7290   }
7291   
7292   /* add bankit comment for genetic code */
7293   valstr = FindValueFromPairInDefline ("gencode_comment", title);
7294   if (valstr != NULL)
7295   {
7296     AddGeneticCodeComment (bsp, valstr);
7297     RemoveValueFromDefline ("gencode_comment", title);
7298     valstr = MemFree (valstr);
7299   }
7300   
7301   sdp_biop = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
7302 
7303   if(sdp_biop == NULL) {
7304     if (bsp->idx.parenttype == OBJ_BIOSEQSET 
7305         && bsp->idx.parentptr != NULL 
7306         && ((BioseqSetPtr)bsp->idx.parentptr)->_class == BioseqseqSet_class_parts) {
7307         /* don't put sources on parts */
7308     } else {
7309         biop = BioSourceNew();
7310         sdp_biop = SeqDescrNew(bsp->descr);
7311         sdp_biop->choice = Seq_descr_source;
7312         sdp_biop->data.ptrvalue = biop;
7313     }
7314   } else {
7315     biop = sdp_biop->data.ptrvalue;
7316   }          
7317   
7318   ParseDeflineToBiop (title, biop);
7319   
7320   if (StringHasNoText (title)) {
7321     /* remove empty defline */
7322     sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
7323     if (prev_sdp == NULL) {
7324       bsp->descr = sdp->next;
7325     } else {
7326       prev_sdp->next = sdp->next;
7327     }
7328     sdp->next = NULL;
7329     sdp = SeqDescrFree (sdp);
7330   }
7331 }
7332 
7333                                          
7334 extern void ParseModifiersFromDefline (IteM i)
7335 {
7336   BaseFormPtr  bfp;
7337   SeqEntryPtr  sep;
7338 
7339 #ifdef WIN_MAC
7340   bfp = currentFormDataPtr;
7341 #else
7342   bfp = GetObjectExtra (i);
7343 #endif
7344   if (bfp == NULL) return;
7345   sep = GetTopSeqEntryForEntityID (bfp->input_entityID);
7346   if (sep == NULL) return;
7347   
7348   VisitBioseqsInSep (sep, NULL, ParseModifiersFromDeflineCallback);
7349   Update ();
7350   ObjMgrSetDirtyFlag (bfp->input_entityID, TRUE);
7351   ObjMgrSendMsg (OM_MSG_UPDATE, bfp->input_entityID, 0, 0);  
7352 }
7353                                     
7354                                           
7355 extern Boolean ProcessOneNucleotideTitle (Int2 seqPackage, 
7356                                           SeqEntryPtr nsep, SeqEntryPtr top)
7357 
7358 {
7359   BioSourcePtr       biop = NULL;
7360   BioseqSetPtr       bssp;
7361   BioseqPtr          nbsp;
7362   Boolean            needbiop;
7363   SeqEntryPtr        sep;
7364   CharPtr            str;
7365   CharPtr            valstr;
7366   CharPtr            title;
7367   ValNodePtr         vnp;
7368   Int4               topology;
7369 #if 0
7370   SeqFeatPtr         sfp;
7371 #endif  
7372 
7373   if (nsep == NULL || top == NULL) return FALSE;
7374   nbsp = (BioseqPtr) nsep->data.ptrvalue;
7375   if (nbsp == NULL) return FALSE;
7376   if (! ISA_na (nbsp->mol)) return FALSE;
7377   str = MemNew (PROC_NUC_STR_SIZE * sizeof (Char));
7378   if (str == NULL) return FALSE;
7379   sep = NULL;
7380  
7381   SeqEntryExplore (top, (Pointer) &sep, FindFirstSeqEntryTitle);
7382   sep = FindNucSeqEntry (sep);
7383   if (sep != NULL) {
7384     vnp = SeqEntryGetSeqDescr (sep, Seq_descr_title, NULL);
7385     if (vnp != NULL && vnp->data.ptrvalue != NULL) {
7386       title = (CharPtr) vnp->data.ptrvalue;
7387       
7388       SetMoleculeAndMolTypeFromTitle (nbsp, title, seqPackage);
7389 
7390       if (nbsp->topology == 0)
7391       {
7392         topology = TOPOLOGY_LINEAR;
7393       }
7394       else
7395       {
7396         topology = nbsp->topology;
7397       }
7398   
7399       /* get topology from defline */
7400       valstr = FindValueFromPairInDefline ("topology", title);
7401       if (valstr != NULL)
7402       {
7403         if (!StringHasNoText (valstr))
7404         {
7405           topology = TopologyFromString (valstr);
7406         }
7407         RemoveValueFromDefline ("topology", title);
7408         valstr = MemFree (valstr);
7409       }
7410       nbsp->topology = topology;
7411       
7412       /* add bankit comment for genetic code */
7413       valstr = FindValueFromPairInDefline ("gencode_comment", title);
7414       if (valstr != NULL)
7415       {
7416         AddGeneticCodeComment (nbsp, valstr);
7417         RemoveValueFromDefline ("gencode_comment", title);
7418         valstr = MemFree (valstr);
7419       }
7420 
7421       needbiop = FALSE;
7422       
7423       if (PackageTypeIsSet (seqPackage)
7424           || seqPackage == SEQ_PKG_GENBANK)
7425       {
7426         needbiop = TRUE;
7427         if (GetAppParam ("SEQUIN", "PREFERENCES", "BIOSRCONALL", NULL, str, PROC_NUC_STR_SIZE)) {
7428           if (StringICmp (str, "FALSE") == 0) {
7429             needbiop = FALSE;
7430           }
7431         }
7432       }
7433       
7434       vnp = SeqEntryGetSeqDescr (sep, Seq_descr_source, NULL);
7435       if (vnp == NULL)
7436       {
7437         biop = ExtractFromDeflineToBioSource (title, NULL);
7438         if (biop == NULL && needbiop)
7439         {
7440           biop = BioSourceNew ();
7441         }
7442 
7443         if (biop != NULL)
7444         {
7445           vnp = CreateNewDescriptor (top, Seq_descr_source);
7446           if (vnp != NULL) {
7447             vnp->data.ptrvalue = (Pointer) biop;
7448           }
7449         }
7450 #if 0        
7451         biop = BioSourceFromDefline (title);
7452         while (biop != NULL)
7453         {
7454           sfp = CreateNewFeature (sep, NULL, SEQFEAT_BIOSRC, NULL);
7455           if (sfp != NULL)
7456           {
7457             sfp->data.value.ptrvalue = biop;
7458           }
7459           biop = BioSourceFromDefline (title);
7460         }
7461 #endif        
7462       }
7463 
7464       if (StringHasNoText (title) || sep != top) {
7465         vnp = NULL;
7466         if (IS_Bioseq (sep)) {
7467           nbsp = (BioseqPtr) sep->data.ptrvalue;
7468           vnp = ValNodeExtract (&(nbsp->descr), Seq_descr_title);
7469         } else if (IS_Bioseq_set (sep)) {
7470           bssp = (BioseqSetPtr) sep->data.ptrvalue;
7471           vnp = ValNodeExtract (&(bssp->descr), Seq_descr_title);
7472         }
7473         if (vnp != NULL && StringHasNoText ((CharPtr) vnp->data.ptrvalue)) {
7474           vnp = ValNodeFreeData (vnp);
7475         }
7476         if (sep != top && vnp != NULL) {
7477           if (IS_Bioseq (top)) {
7478             nbsp = (BioseqPtr) top->data.ptrvalue;
7479             ValNodeLink (&(nbsp->descr), vnp);
7480           } else if (IS_Bioseq_set (top)) {
7481             bssp = (BioseqSetPtr) top->data.ptrvalue;
7482             ValNodeLink (&(bssp->descr), vnp);
7483           }
7484         }
7485       }
7486     }
7487   } else {
7488     needbiop = FALSE;
7489     if (PackageTypeIsSet (seqPackage)
7490         || seqPackage == SEQ_PKG_GENOMICCDNA)
7491     {
7492       needbiop = TRUE;
7493       if (GetAppParam ("SEQUIN", "PREFERENCES", "BIOSRCONALL", NULL, str, PROC_NUC_STR_SIZE)) {
7494         if (StringICmp (str, "FALSE") == 0) {
7495           needbiop = FALSE;
7496         }
7497       }
7498     }
7499   }
7500   MemFree (str);
7501   
7502   return TRUE;
7503 }
7504 
7505 static Boolean AutomaticNucleotideProcess (SequencesFormPtr sqfp, SeqEntryPtr nsep,
7506                                            SeqEntryPtr top)
7507 
7508 {
7509   BioseqSetPtr  bssp;
7510   Boolean       rsult;
7511   SeqEntryPtr   tmp;
7512 
7513   if (sqfp == NULL || nsep == NULL || top == NULL) return FALSE;
7514   if (IS_Bioseq_set (nsep)) {
7515     bssp = (BioseqSetPtr) nsep->data.ptrvalue;
7516     rsult = FALSE;
7517     if (bssp != NULL) {
7518       for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
7519         if (AutomaticNucleotideProcess (sqfp, tmp, top)) {
7520           rsult = TRUE;
7521         }
7522       }
7523     }
7524     return rsult;
7525   }
7526   return ProcessOneNucleotideTitle (sqfp->seqPackage, 
7527                                     nsep, top);
7528 }
7529 
/* Node of the binary tree built by BuildTree to detect Bioseqs whose ID
   labels collide (compared case-insensitively). */
typedef struct idlist {
  BioseqPtr  bsp;            /* Bioseq this node was created for */
  CharPtr    key;            /* ID label string; released by FreeTree */
  struct idlist PNTR left;   /* subtree BuildTree descends into when key sorts after the new label */
  struct idlist PNTR right;  /* subtree BuildTree descends into when key sorts before the new label */
} IdList, PNTR IdListPtr;
7536 
7537 static void BuildTree (IdListPtr PNTR head, BioseqPtr bsp, CharPtr x)
7538 
7539 {
7540   Int2       comp;
7541   IdListPtr  idlist;
7542   SeqIdPtr   sip;
7543   CharPtr    str;
7544 
7545   if (*head != NULL) {
7546     idlist = *head;
7547     comp = StringICmp (idlist->key, x);
7548     if (comp < 0) {
7549       BuildTree (&(idlist->right), bsp, x);
7550     } else if (comp > 0) {
7551       BuildTree (&(idlist->left), bsp, x);
7552     } else {
7553       sip = MakeNewProteinSeqId (NULL, NULL);
7554       if (sip != NULL) {
7555         bsp->id = SeqIdFree (bsp->id);
7556         bsp->id = sip;
7557         SeqMgrReplaceInBioseqIndex (bsp);
7558         str = SeqIdWholeLabel (SeqIdFindWorst (bsp->id), PRINTID_REPORT);
7559         BuildTree (head, bsp, str);
7560         str = MemFree (str);
7561       }
7562     }
7563   } else {
7564     idlist = MemNew (sizeof (IdList));
7565     if (idlist != NULL) {
7566       *head = idlist;
7567       idlist->bsp = bsp;
7568       idlist->key = SeqIdWholeLabel (SeqIdFindWorst (bsp->id), PRINTID_REPORT);
7569       idlist->left = NULL;
7570       idlist->right = NULL;
7571     }
7572   }
7573 }
7574 
7575 static void FreeTree (IdListPtr PNTR head)
7576 
7577 {
7578   IdListPtr  idlist;
7579 
7580   if (head != NULL && *head != NULL) {
7581     idlist = *head;
7582     FreeTree (&(idlist->left));
7583     FreeTree (&(idlist->right));
7584     MemFree (idlist->key);
7585     MemFree (idlist);
7586   }
7587 }
7588 
7589 static void ResolveCollidingIDs (IdListPtr PNTR head, SeqEntryPtr list)
7590 
7591 {
7592   BioseqPtr  bsp;
7593   CharPtr    str;
7594 
7595   if (head == NULL) return;
7596   while (list != NULL) {
7597     if (IS_Bioseq (list)) {
7598       bsp = (BioseqPtr) list->data.ptrvalue;
7599       if (bsp != NULL) {
7600         str = SeqIdWholeLabel (SeqIdFindWorst (bsp->id), PRINTID_REPORT);
7601         BuildTree (head, bsp, str);
7602         str = MemFree (str);
7603       }
7604     }
7605     list = list->next;
7606   }
7607 }
7608 
7609 
7610 static void PutMolInfoOnSeqEntry (SequencesFormPtr sqfp, SeqEntryPtr sep)
7611 
7612 {
7613   BioseqSetPtr bssp;
7614   MolInfoPtr   mip;
7615   ValNodePtr   vnp;
7616 
7617   if (sqfp != NULL && sep != NULL) {
7618     if (IS_Bioseq_set (sep))
7619     {
7620       bssp = (BioseqSetPtr) sep->data.ptrvalue;
7621       for (sep = bssp->seq_set; sep != NULL; sep = sep->next) 
7622       {
7623         PutMolInfoOnSeqEntry (sqfp, sep);
7624       }
7625       return;
7626     }
7627 
7628     vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
7629     if (vnp == NULL)
7630     {
7631       vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
7632     }
7633     if (vnp != NULL)
7634     {
7635       mip = (MolInfoPtr) vnp->data.ptrvalue;
7636       if (mip == NULL)
7637       {
7638         mip = MolInfoNew ();
7639         vnp->data.ptrvalue = mip;
7640       }
7641     }
7642   }
7643 }
7644 
7645 static void PrefixOrgToDefline (SeqEntryPtr sep)
7646 
7647 {
7648   BioSourcePtr  biop;
7649   BioseqPtr     bsp;
7650   BioseqSetPtr  bssp;
7651   CharPtr       def;
7652   OrgRefPtr     orp;
7653   CharPtr       ptr;
7654   CharPtr       str;
7655   Char          taxname [64];
7656   ValNodePtr    ttl;
7657   ValNodePtr    vnp;
7658 
7659   if (sep == NULL) return;
7660   if (IS_Bioseq_set (sep)) {
7661     bssp = (BioseqSetPtr) sep->data.ptrvalue;
7662     if (bssp != NULL && (bssp->_class == 7 ||
7663                          (IsPopPhyEtcSet (bssp->_class)))) {
7664       for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
7665         PrefixOrgToDefline (sep);
7666       }
7667       return;
7668     }
7669   }
7670 
7671   if (! IS_Bioseq (sep)) return;
7672   bsp = (BioseqPtr) sep->data.ptrvalue;
7673   if (bsp == NULL) return;
7674 
7675   taxname [0] = '\0';
7676   orp = NULL;
7677   biop = NULL;
7678   ttl = NULL;
7679   vnp = bsp->descr;
7680   for (vnp = bsp->descr; vnp != NULL; vnp = vnp->next) {
7681     if (vnp->choice == Seq_descr_source) {
7682       biop = (BioSourcePtr) vnp->data.ptrvalue;
7683     } else if (vnp->choice == Seq_descr_org) {
7684       orp = (OrgRefPtr) vnp->data.ptrvalue;
7685     } else if (vnp->choice == Seq_descr_title) {
7686       ttl = vnp;
7687     }
7688   }
7689   if (orp == NULL && biop != NULL) {
7690     orp = biop->org;
7691   }
7692   if (orp == NULL) return;
7693   if (ttl == NULL) return;
7694   StringNCpy_0 (taxname, orp->taxname, sizeof (taxname));
7695   ptr = StringSearch (taxname, "(");
7696   if (ptr != NULL) {
7697     *ptr = '\0';
7698   }
7699   TrimSpacesAroundString (taxname);
7700   if ((StringICmp (taxname, "Human immunodeficiency virus type 1") == 0) ||
7701       (StringICmp (taxname, "Human immunodeficiency virus 1") == 0)) {
7702     StringCpy (taxname, "HIV-1");
7703   } else if ((StringICmp (taxname,"Human immunodeficiency virus type 2")==0) ||
7704              (StringICmp (taxname,"Human immunodeficiency virus 2")==0)) {
7705     StringCpy (taxname, "HIV-2");
7706   }
7707 
7708   def = (CharPtr) ttl->data.ptrvalue;
7709   if (StringHasNoText (def)) return;
7710 
7711   ptr = StringISearch (def, taxname);
7712   if (ptr != NULL && ptr == def) return;
7713   str = MemNew ((StringLen (taxname) + StringLen (def) + 4) * sizeof (Char));
7714   if (str == NULL) return;
7715   StringCpy (str, taxname);
7716   StringCat (str, " ");
7717   StringCat (str, def);
7718   ttl->data.ptrvalue = MemFree (ttl->data.ptrvalue);
7719   ttl->data.ptrvalue = str;
7720 }
7721 
7722 static CharPtr onecomponent = "\
7723 Multiple sequence components are expected in this submission.\n\
7724 They should all be read in at the same time from the same file.";
7725 
7726 static void OnlyOneComponentWarning (SequencesFormPtr sqfp)
7727 
7728 {
7729   CharPtr  type;
7730 
7731   if (sqfp != NULL) {
7732     if (sqfp->seqPackage == SEQ_PKG_GENOMICCDNA
7733         || PackageTypeIsSingle (sqfp->seqPackage))
7734     {
7735       return;
7736     }
7737     switch (sqfp->seqPackage) {
7738       case SEQ_PKG_SEGMENTED :
7739         type = "segmented sequence";
7740         break;
7741       case SEQ_PKG_POPULATION :
7742         type = "population set";
7743         break;
7744       case SEQ_PKG_PHYLOGENETIC :
7745         type = "phylogenetic set";
7746         break;
7747       case SEQ_PKG_MUTATION :
7748         type = "mutation set";
7749         break;
7750       case SEQ_PKG_ENVIRONMENT :
7751         type = "environmental samples";
7752         break;
7753       case SEQ_PKG_GENBANK :
7754         type = "batch submission";
7755         break;
7756       default :
7757         type = "unknown set";
7758         break;
7759     }
7760     Message (MSG_OK, "WARNING - There is only one component in this %s.\n%s",
7761              type, onecomponent);
7762   }
7763 }
7764 
7765 /*---------------------------------*/
7766 /* Parse the gene and gene-related */
7767 /* fields from the title.          */
7768 /*---------------------------------*/
7769 extern void 
7770 AddGeneFeatureFromTitle 
7771 (SeqEntryPtr nucsep,
7772  CharPtr ttl, 
7773  SeqLocPtr slp)
7774 {
7775   CharPtr    gene = NULL;
7776   CharPtr    gene_desc = NULL;
7777   CharPtr    allele = NULL;
7778   CharPtr    gene_syn = NULL;
7779   GeneRefPtr grp = NULL;
7780   SeqFeatPtr sfp;
7781   SeqIdPtr   sip;
7782   BioseqPtr  nbsp, bsp;
7783   SeqLocPtr  gslp;
7784   Boolean    hasNulls;
7785   
7786   if (nucsep == NULL || !IS_Bioseq (nucsep) 
7787       || (nbsp = (BioseqPtr) nucsep->data.ptrvalue) == NULL
7788       || StringHasNoText (ttl) || slp == NULL)
7789   {
7790     return;
7791   }
7792   
7793   gene = FindValueFromPairInDefline ("gene", ttl);
7794   if (!StringHasNoText (gene))
7795   {
7796     gene_desc = StringChr (gene, ';');
7797     if (gene_desc != NULL) {
7798       *gene_desc = '\0';
7799       gene_desc++;
7800       allele = StringChr (gene_desc, ';');
7801       if (allele != NULL) {
7802         *allele = '\0';
7803         allele++;
7804       }
7805     }
7806     grp = CreateNewGeneRef (gene, allele, gene_desc, FALSE);
7807   }
7808   gene = MemFree (gene);
7809   
7810   /*-----------------------------------------*/
7811   /* Parse the gene_syn field from the title */
7812   /*-----------------------------------------*/
7813   
7814   gene_syn = FindValueFromPairInDefline ("gene_syn", ttl);
7815   if (!StringHasNoText (gene_syn))
7816   {
7817     if (grp == NULL) {
7818       grp = GeneRefNew ();
7819     }
7820     ValNodeCopyStr(&(grp->syn),0,gene_syn);
7821   }
7822   gene_syn = MemFree (gene_syn);
7823 
7824   /* Create the gene feature */
7825   if (grp != NULL) {
7826     if (ExtendGene (grp, nucsep, slp)) {
7827       grp = GeneRefFree (grp);
7828     } else {
7829       sfp = CreateNewFeature (nucsep, NULL, SEQFEAT_GENE, NULL);
7830       if (sfp != NULL) {
7831         sfp->data.value.ptrvalue = (Pointer) grp;
7832         sfp->location = SeqLocFree (sfp->location);
7833         sfp->location = AsnIoMemCopy ((Pointer) slp,
7834                                       (AsnReadFunc) SeqLocAsnRead,
7835                                       (AsnWriteFunc) SeqLocAsnWrite);
7836         sip = SeqLocId (sfp->location);
7837         if (sip != NULL) {
7838           bsp = BioseqFind (sip);
7839         } else {
7840           bsp = nbsp;
7841         }
7842         if (bsp != NULL) {
7843           gslp = SeqLocMerge (bsp, sfp->location, NULL, TRUE, FALSE, FALSE);
7844           if (gslp != NULL) {
7845             sfp->location = SeqLocFree (sfp->location);
7846             sfp->location = gslp;
7847             if (bsp->repr == Seq_repr_seg) {
7848               gslp = SegLocToPartsEx (bsp, sfp->location, TRUE);
7849               sfp->location = SeqLocFree (sfp->location);
7850               sfp->location = gslp;
7851               hasNulls = LocationHasNullsBetween (sfp->location);
7852               sfp->partial = (sfp->partial || hasNulls);
7853             }
7854             FreeAllFuzz (gslp);
7855           }
7856         }
7857       }
7858     }
7859     RemoveValueFromDefline ("gene", ttl);
7860     RemoveValueFromDefline ("gene_syn", ttl);
7861   }
7862 }
7863 
7864 extern SeqFeatPtr AddProteinFeatureFromDefline (SeqEntryPtr psep, CharPtr title)
7865 {
7866   CharPtr    activity = NULL;
7867   CharPtr    ec = NULL;
7868   CharPtr    prot_name = NULL;
7869   CharPtr    prot_desc = NULL;
7870   CharPtr    other_prot_desc = NULL, tmp_desc;
7871   ProtRefPtr prp;
7872   SeqFeatPtr sfp = NULL;
7873   
7874   if (psep == NULL)
7875   {
7876     return NULL;
7877   }
7878   
7879         /*-----------------------------------------*/
7880         /* Parse the function field from the title */
7881         /*-----------------------------------------*/
7882 
7883   activity = FindValueFromPairInDefline ("function", title);
7884 
7885         /*------------------------------------------*/
7886         /* Parse the EC_number field from the title */
7887         /*------------------------------------------*/
7888 
7889   ec = FindValueFromPairInDefline ("EC_number", title);
7890 
7891         /*---------------------------------*/
7892         /* Parse the protein and prot_desc */
7893         /* fields from the title.          */
7894         /*---------------------------------*/
7895 
7896   prot_name = FindValueFromPairInDefline ("protein", title);
7897 
7898         /*---------------------------------*/
7899         /* If we found a protein value ... */
7900         /*---------------------------------*/
7901   if (!StringHasNoText (prot_name))
7902   {
7903           /*----------------------------------------------*/
7904           /* ... search for a protein description, either */
7905           /*     in the prot field (seperated by a ';')   */
7906           /*     or in its own 'prot_desc' field.         */
7907           /*----------------------------------------------*/
7908 
7909     prot_desc = StringChr (prot_name, ';');
7910           if (prot_desc != NULL)
7911           {
7912                   *prot_desc = '\0';
7913                   prot_desc++;
7914                   /* ignore this description if empty */
7915                   if (StringHasNoText (prot_desc))
7916                   {
7917                     prot_desc = NULL;
7918                   }
7919                   else
7920                   {
7921                     prot_desc = StringSave (prot_desc);
7922                   }
7923           }
7924   }
7925         other_prot_desc = FindValueFromPairInDefline ("prot_desc", title);
7926         if (StringHasNoText (other_prot_desc))
7927         {
7928           other_prot_desc = MemFree (other_prot_desc);
7929         }
7930         else
7931         {
7932    if (prot_desc == NULL)
7933           {
7934             prot_desc = other_prot_desc;
7935             other_prot_desc = NULL;
7936           }
7937           else 
7938           {
7939       tmp_desc = (CharPtr) MemNew ((StringLen (prot_desc) + StringLen (other_prot_desc) + 3)
7940                                        * sizeof (Char));
7941             if (tmp_desc != NULL)
7942             {
7943               StringCpy (tmp_desc, prot_desc);
7944               StringCat (tmp_desc, ";");
7945             StringCat (tmp_desc, other_prot_desc);
7946               prot_desc = MemFree (prot_desc);
7947               other_prot_desc = MemFree (other_prot_desc);
7948               prot_desc = tmp_desc;
7949             }
7950           }
7951         }
7952         
7953         /*--------------------------------*/
7954         /* ... add the prot and prot_desc */
7955         /*     to the Seq Features.       */
7956         /*--------------------------------*/
7957 
7958         prp = CreateNewProtRef (prot_name, prot_desc, ec, activity);
7959         if (prp != NULL)
7960         {
7961                 sfp = CreateNewFeature (psep, NULL, SEQFEAT_PROT, NULL);
7962                 if (sfp != NULL)
7963                 {
7964                   sfp->data.value.ptrvalue = (Pointer) prp;
7965                   RemoveValueFromDefline ("protein", title);
7966                   RemoveValueFromDefline ("prot_desc", title);
7967                   RemoveValueFromDefline ("function", title);
7968                   RemoveValueFromDefline ("EC_number", title);
7969                 }
7970         }
7971   return sfp;
7972 }
7973 
7974 extern void 
7975 AddCodingRegionFieldsFromProteinTitle 
7976 (CdRegionPtr  crp,
7977  CharPtr      title, 
7978  CharPtr PNTR pcomment)
7979 {
7980   CharPtr comment, comment_loc, total_comment = NULL, tmp_comment;
7981   
7982   if (crp == NULL || StringHasNoText (title))
7983   {
7984     return;
7985   }
7986   
7987         /*---------------------*/
7988         /* Parse the ORF field */
7989         /*---------------------*/
7990   if (FindValuePairInDefLine ("orf", title, NULL) != NULL)
7991   {
7992     crp->orf = TRUE;
7993     RemoveValueFromDefline ("orf", title);
7994   }
7995 
7996   if (pcomment == NULL)
7997   {
7998     return;
7999   }
8000 
8001         /*-------------------------------*/
8002         /* Parse the comment/note fields */
8003         /*-------------------------------*/
8004   comment_loc = FindValuePairInDefLine ("comment", title, NULL);
8005   while (comment_loc != NULL)
8006   {
8007     comment = FindValueFromPairInDefline ("comment", comment_loc);
8008     if (!StringHasNoText (comment))
8009     {
8010       if (total_comment == NULL)
8011       {
8012         total_comment = comment;
8013         comment = NULL;
8014       }
8015       else
8016       {
8017         tmp_comment = (CharPtr) MemNew ((StringLen (total_comment) + StringLen (comment) + 3) * sizeof (Char));
8018         if (tmp_comment != NULL)
8019         {
8020           StringCpy (tmp_comment, total_comment);
8021           StringCat (tmp_comment, ";");
8022           StringCat (tmp_comment, comment);
8023           total_comment = MemFree (total_comment);
8024           total_comment = tmp_comment;
8025         }
8026       }
8027     }
8028     comment = MemFree (comment);
8029     RemoveValueFromDefline ("comment", title);
8030     comment_loc = FindValuePairInDefLine ("comment", title, NULL);
8031   }
8032   
8033   *pcomment = total_comment;
8034 }
8035 
8036 static void AutomaticMrnaProcess (SeqEntryPtr nucsep, SeqEntryPtr mrnasep, Boolean partial5, Boolean partial3)
8037 
8038 {
8039   CharPtr     mrna = NULL;
8040   CharPtr     comment = NULL;
8041   BioseqPtr   bsp;
8042   MolInfoPtr  mip;
8043   BioseqPtr   mrnabsp;
8044   BioseqPtr   nucbsp;
8045   SeqLocPtr   oldslp;
8046   RnaRefPtr   rrp;
8047   SeqFeatPtr  sfp;
8048   SeqIdPtr    sip;
8049   SeqLocPtr   slp;
8050   CharPtr     ttl;
8051   ValNodePtr  vnp;
8052 
8053   if (nucsep == NULL || mrnasep == NULL) return;
8054   if (IS_Bioseq (nucsep) && IS_Bioseq (mrnasep)) {
8055     nucbsp = (BioseqPtr) nucsep->data.ptrvalue;
8056     mrnabsp = (BioseqPtr) mrnasep->data.ptrvalue;
8057     if (nucbsp == NULL || mrnabsp == NULL) return;
8058     slp = AlignmRNA2genomic (nucbsp, mrnabsp);
8059     if (slp == NULL) return;
8060     sip = SeqLocId (slp);
8061     if (sip != NULL) {
8062       bsp = BioseqFind (sip);
8063       if (bsp != NULL) {
8064         if (bsp->repr == Seq_repr_seg) {
8065           oldslp = slp;
8066           slp = SegLocToParts (bsp, oldslp);
8067           FreeAllFuzz (slp);
8068           SeqLocFree (oldslp);
8069         }
8070       }
8071     }
8072     StripLocusFromSeqLoc (slp);
8073     ttl = NULL;
8074     vnp = ValNodeFindNext (mrnabsp->descr, NULL, Seq_descr_title);
8075     if (vnp != NULL) {
8076       ttl = (CharPtr) vnp->data.ptrvalue;
8077     }
8078     if (ttl != NULL) {
8079       AddGeneFeatureFromTitle (nucsep, ttl, slp);
8080     
8081       /* get mRNA name */
8082       mrna = FindValueFromPairInDefline ("mrna", ttl);
8083       RemoveValueFromDefline ("mrna", ttl);
8084       if (StringHasNoText (mrna))
8085       {
8086         mrna = MemFree (mrna);
8087         mrna = FindValueFromPairInDefline ("cdna", ttl);
8088         RemoveValueFromDefline ("cdna", ttl);
8089       }
8090     }
8091     rrp = RnaRefNew ();
8092     if (rrp != NULL) {
8093       rrp->type = 2;
8094       if (! StringHasNoText (mrna)) {
8095         rrp->ext.choice = 1;
8096         rrp->ext.value.ptrvalue = mrna;
8097         mrna = NULL;
8098       }
8099       sfp = CreateNewFeature (nucsep, NULL, SEQFEAT_RNA, NULL);
8100       if (sfp != NULL) {
8101         sfp->data.value.ptrvalue = (Pointer) rrp;
8102         sfp->location = SeqLocFree (sfp->location);
8103         sfp->location = AsnIoMemCopy ((Pointer) slp,
8104                                       (AsnReadFunc) SeqLocAsnRead,
8105                                       (AsnWriteFunc) SeqLocAsnWrite);
8106         SetSeqFeatProduct (sfp, mrnabsp);
8107         SetSeqLocPartial (sfp->location, partial5, partial3);
8108         sfp->partial = (sfp->partial || partial5 || partial3);
8109         if (ttl != NULL) {
8110           comment = FindValueFromPairInDefline ("comment", ttl);
8111           if (!StringHasNoText (comment)) {
8112             sfp->comment = comment;
8113           }
8114           else
8115           {
8116             comment = MemFree (comment);
8117           }
8118           RemoveValueFromDefline ("comment", ttl);
8119         }
8120       }
8121     }
8122     mrna = MemFree (mrna);
8123     SeqLocFree (slp);
8124     if (StringHasNoText (ttl)) {
8125       ValNodeExtract (&(mrnabsp->descr), Seq_descr_title);
8126     }
8127     mip = MolInfoNew ();
8128     if (mip != NULL) {
8129       mip->biomol = 3;
8130       if (partial5 && partial3) {
8131         mip->completeness = 5;
8132       } else if (partial5) {
8133         mip->completeness = 3;
8134       } else if (partial3) {
8135         mip->completeness = 4;
8136       }
8137       vnp = CreateNewDescriptor (mrnasep, Seq_descr_molinfo);
8138       if (vnp != NULL) {
8139         vnp->data.ptrvalue = (Pointer) mip;
8140       }
8141     }
8142     mrnabsp->mol = Seq_mol_rna;
8143   }
8144 }
8145 
8146 static CharPtr LookForValueInBioseq (SeqEntryPtr sep, Uint1 mol, CharPtr valname)
8147 {
8148   BioseqPtr   bsp;
8149   CharPtr     title;
8150   ValNodePtr  vnp;
8151 
8152   if (sep == NULL || StringHasNoText (valname)) return FALSE;
8153   if (! IS_Bioseq (sep)) return FALSE;
8154   bsp = (BioseqPtr) sep->data.ptrvalue;
8155   if (bsp == NULL || bsp->mol != mol || bsp->descr == NULL) return FALSE;
8156   vnp = ValNodeFindNext (bsp->descr, NULL, Seq_descr_title);
8157   if (vnp == NULL || vnp->data.ptrvalue == NULL) return FALSE;
8158   title = (CharPtr) vnp->data.ptrvalue;
8159   return FindValueFromPairInDefline (valname, title);
8160 }
8161 
8162 static void FindBioseqWithValue (SeqEntryPtr sep, Uint1 mol, CharPtr valname, CharPtr value, SeqEntryPtr PNTR rsult)
8163 {
8164   BioseqPtr     bsp = NULL;
8165   BioseqSetPtr  bssp = NULL;
8166   CharPtr       match_value;
8167 
8168   if (sep == NULL || sep->data.ptrvalue == NULL || rsult == NULL) return;
8169   if (IS_Bioseq (sep)) {
8170     bsp = (BioseqPtr) sep->data.ptrvalue;
8171     match_value = LookForValueInBioseq (sep, mol, valname);
8172     if (StringICmp (match_value, value))
8173     {
8174       *rsult = sep;
8175     }
8176     match_value = MemFree (match_value);
8177   } else if (IS_Bioseq_set (sep)) {
8178     bssp = (BioseqSetPtr) sep->data.ptrvalue;
8179     for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
8180       FindBioseqWithValue (sep, mol, valname, value, rsult);
8181     }
8182   }
8183 }
8184 
8185 static void RemoveValueFromBioseq (SeqEntryPtr sep, CharPtr valname)
8186 {
8187   BioseqPtr   bsp;
8188   ValNodePtr  vnp;
8189 
8190   if (sep == NULL) return;
8191   if (! IS_Bioseq (sep)) return;
8192   bsp = (BioseqPtr) sep->data.ptrvalue;
8193   if (bsp == NULL || bsp->descr == NULL) return;
8194   vnp = SeqEntryGetSeqDescr (sep, Seq_descr_title, NULL);
8195   if (vnp == NULL) return;
8196   RemoveValueFromDefline (valname, vnp->data.ptrvalue);
8197   if (StringHasNoText (vnp->data.ptrvalue)) {
8198     ValNodeExtract (&(bsp->descr), Seq_descr_title);
8199   }  
8200 }
8201 
8202 static SeqEntryPtr FindRnaByRefOnRna (SeqEntryPtr sep, SeqEntryPtr psep)
8203 
8204 {
8205   SeqEntryPtr  msep;
8206   CharPtr      prot_name;
8207 
8208   msep = NULL;
8209   if (sep == NULL || psep == NULL) return NULL;
8210   prot_name = LookForValueInBioseq (psep, Seq_mol_aa, "prot");
8211   if (!StringHasNoText (prot_name))
8212   {
8213     FindBioseqWithValue (sep, Seq_mol_rna, "prot", prot_name, &msep);
8214     RemoveValueFromBioseq (msep, "prot");
8215   }
8216   prot_name = MemFree (prot_name);
8217   return msep;
8218 }
8219 
8220 static void FindRnaByName (SeqEntryPtr sep, CharPtr str, SeqEntryPtr PNTR msep)
8221 
8222 {
8223   BioseqPtr     bsp = NULL;
8224   BioseqSetPtr  bssp = NULL;
8225   RnaRefPtr     rrp;
8226   SeqAnnotPtr   sap;
8227   SeqFeatPtr    sfp;
8228 
8229   if (sep == NULL || sep->data.ptrvalue == NULL) return;
8230   if (str == NULL || msep == NULL) return;
8231   if (IS_Bioseq (sep)) {
8232     bsp = (BioseqPtr) sep->data.ptrvalue;
8233     sap = bsp->annot;
8234   } else if (IS_Bioseq_set (sep)) {
8235     bssp = (BioseqSetPtr) sep->data.ptrvalue;
8236     sap = bssp->annot;
8237   } else return;
8238   while (sap != NULL) {
8239     if (sap->type == 1) {
8240       sfp = (SeqFeatPtr) sap->data;
8241       while (sfp != NULL) {
8242         if (sfp->data.choice == SEQFEAT_RNA) {
8243           rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
8244           if (rrp != NULL && rrp->type == 2 && rrp->ext.choice == 1 && sfp->product != NULL) {
8245             if (StringICmp (rrp->ext.value.ptrvalue, str) == 0) {
8246               bsp = BioseqFind (SeqLocId (sfp->product));
8247               if (bsp != NULL) {
8248                 *msep = SeqMgrGetSeqEntryForData (bsp);
8249               }
8250             }
8251           }
8252         }
8253         sfp = sfp->next;
8254       }
8255     }
8256     sap = sap->next;
8257   }
8258   if (bssp != NULL) {
8259     for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
8260       FindRnaByName (sep, str, msep);
8261     }
8262   }
8263 }
8264 
8265 static SeqEntryPtr FindRnaByRefOnProtein (SeqEntryPtr sep, SeqEntryPtr psep)
8266 
8267 {
8268   SeqEntryPtr  msep;
8269   CharPtr      mrna_name;
8270 
8271   msep = NULL;
8272   if (sep == NULL || psep == NULL) return NULL;
8273   mrna_name = LookForValueInBioseq (psep, Seq_mol_aa, "mrna");
8274   if (!StringHasNoText (mrna_name))
8275   {
8276     FindRnaByName (sep, mrna_name, &msep);
8277     RemoveValueFromBioseq (msep, "mrna");
8278   }
8279   mrna_name = MemFree (mrna_name);
8280   return msep;
8281 }
8282 
8283 static void FindRnaByLocationOverlap (SeqEntryPtr sep, SeqLocPtr slp,
8284                                       Int4Ptr mindiff, SeqEntryPtr PNTR msep)
8285 
8286 {
8287   BioseqPtr     bsp = NULL;
8288   BioseqSetPtr  bssp = NULL;
8289   Int4          diff;
8290   RnaRefPtr     rrp;
8291   SeqAnnotPtr   sap;
8292   SeqFeatPtr    sfp;
8293 
8294   if (sep == NULL || sep->data.ptrvalue == NULL) return;
8295   if (slp == NULL || mindiff == NULL || msep == NULL) return;
8296   if (IS_Bioseq (sep)) {
8297     bsp = (BioseqPtr) sep->data.ptrvalue;
8298     sap = bsp->annot;
8299   } else if (IS_Bioseq_set (sep)) {
8300     bssp = (BioseqSetPtr) sep->data.ptrvalue;
8301     sap = bssp->annot;
8302   } else return;
8303   while (sap != NULL) {
8304     if (sap->type == 1) {
8305       sfp = (SeqFeatPtr) sap->data;
8306       while (sfp != NULL) {
8307         if (sfp->data.choice == SEQFEAT_RNA) {
8308           rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
8309           if (rrp != NULL && rrp->type == 2 && sfp->product != NULL) {
8310             diff = SeqLocAinB (slp, sfp->location);
8311             if (diff >= 0) {
8312               if (diff < *mindiff) {
8313                 bsp = BioseqFind (SeqLocId (sfp->product));
8314                 if (bsp != NULL) {
8315                   *mindiff = diff;
8316                   *msep = SeqMgrGetSeqEntryForData (bsp);
8317                 }
8318               }
8319             }
8320           }
8321         }
8322         sfp = sfp->next;
8323       }
8324     }
8325     sap = sap->next;
8326   }
8327   if (bssp != NULL) {
8328     for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
8329       FindRnaByLocationOverlap (sep, slp, mindiff, msep);
8330     }
8331   }
8332 }
8333 
8334 static void FuseNucProtBiosources (SeqEntryPtr sep)
8335 
8336 {
8337   BioSourcePtr  biop1, biop2;
8338   BioseqPtr     bsp;
8339   BioseqSetPtr  bssp;
8340   ValNodePtr    PNTR prev;
8341   ValNodePtr    sdp1, sdp2;
8342   SeqEntryPtr   tmp;
8343 
8344   if (sep == NULL) return;
8345   if (! IS_Bioseq_set (sep)) return;
8346   bssp = (BioseqSetPtr) sep->data.ptrvalue;
8347   if (bssp == NULL || bssp->_class != BioseqseqSet_class_nuc_prot) return;
8348   tmp = FindNucSeqEntry (sep);
8349   if (tmp == NULL) return;
8350   if (! IS_Bioseq (tmp)) return;
8351   bsp = (BioseqPtr) tmp->data.ptrvalue;
8352   if (bsp == NULL) return;
8353   prev = &(bssp->descr);
8354   sdp1 = bssp->descr;
8355   while (sdp1 != NULL && sdp1->choice != Seq_descr_source) {
8356     prev = &(sdp1->next);
8357     sdp1 = sdp1->next;
8358   }
8359   if (sdp1 == NULL) return;
8360   sdp2 = SeqEntryGetSeqDescr (tmp, Seq_descr_source, NULL);
8361   if (sdp2 == NULL) return;
8362   biop1 = (BioSourcePtr) sdp1->data.ptrvalue;
8363   biop2 = (BioSourcePtr) sdp2->data.ptrvalue;
8364   if (CmpOrgById (biop1, biop2)) {
8365     *prev = sdp1->next;
8366     sdp1->next = NULL;
8367     SeqDescrFree (sdp1);
8368   }
8369 }
8370 
8371 static void AssignOneProtein 
8372 (SeqEntryPtr      prot_sep, 
8373  SequencesFormPtr sqfp,
8374  SeqEntryPtr      assign_sep,
8375  SeqLocPtr        use_this,
8376  BioseqPtr        nucbsp,
8377  Int2             code,
8378  Boolean          makeMRNA)
8379 {
8380   MolInfoPtr        mip;
8381   SeqEntryPtr       msep = NULL;
8382   BioseqPtr         protbsp;
8383   SeqLocPtr         slp;
8384   Int4              mindiff;
8385   Boolean           partialN;
8386   Boolean           partialC;
8387   ValNodePtr        vnp;
8388   
8389   if (prot_sep == NULL)
8390   {
8391     return;
8392   }
8393   
8394   mip = MolInfoNew ();
8395   if (mip != NULL) {
8396     mip->biomol = 8;
8397     if (sqfp == NULL) {
8398       /* no technique */
8399     } else if (GetStatus (sqfp->protTechBoth)) {
8400       mip->tech = 10;
8401     } else {
8402       mip->tech = 13;
8403     }
8404     if (sqfp == NULL) {
8405       if (use_this == NULL) {
8406         partialN = FALSE;
8407         partialC = FALSE;
8408       } else {
8409         CheckSeqLocForPartial (use_this, &partialN, &partialC);
8410       }
8411     } else {
8412       partialN = GetStatus (sqfp->partialN);
8413       partialC = GetStatus (sqfp->partialC);
8414     }
8415     if (partialN && partialC) {
8416       mip->completeness = 5;
8417     } else if (partialN) {
8418       mip->completeness = 3;
8419     } else if (partialC) {
8420       mip->completeness = 4;
8421     }
8422     vnp = CreateNewDescriptor (prot_sep, Seq_descr_molinfo);
8423     if (vnp != NULL) {
8424       vnp->data.ptrvalue = (Pointer) mip;
8425     }
8426   }
8427   if (assign_sep != NULL) {
8428     if (sqfp != NULL && sqfp->seqPackage == SEQ_PKG_GENOMICCDNA) {
8429       ClearBatchSuggestNucleotide ();
8430       msep = FindRnaByRefOnProtein (assign_sep, prot_sep);
8431       if (msep == NULL) {
8432         msep = FindRnaByRefOnRna (assign_sep, prot_sep);
8433       }
8434       if (msep == NULL && nucbsp != NULL && IS_Bioseq (prot_sep)) {
8435         protbsp = (BioseqPtr) prot_sep->data.ptrvalue;
8436         if (protbsp != NULL) {
8437           slp = PredictCodingRegion (nucbsp, protbsp, code);
8438           if (slp != NULL) {
8439             mindiff = INT4_MAX;
8440             FindRnaByLocationOverlap (assign_sep, slp, &mindiff, &msep);
8441           }
8442           SeqLocFree (slp);
8443         }
8444       }
8445     }
8446     if (msep != NULL) {
8447       msep = GetBestTopParentForDataEx (ObjMgrGetEntityIDForChoice (msep),
8448                                         (BioseqPtr) msep->data.ptrvalue, TRUE);
8449     }
8450     if (msep == NULL) {
8451       msep = assign_sep;
8452       if (IS_Bioseq (msep))
8453       {
8454         msep = GetBestTopParentForDataEx (ObjMgrGetEntityIDForChoice (msep),
8455                                           (BioseqPtr) msep->data.ptrvalue, TRUE);
8456       }
8457     }
8458     AddSeqEntryToSeqEntry (msep, prot_sep, TRUE);
8459     AutomaticProteinProcess (msep, prot_sep, code, makeMRNA, use_this);
8460   } else {
8461     AutomaticProteinProcess (assign_sep, prot_sep, code, makeMRNA, use_this);
8462   }  
8463 }
8464 
8465 static SeqEntryPtr FindSeqEntryWithTranscriptID (SeqEntryPtr sep, CharPtr transcript_id)
8466 {
8467   SeqEntryPtr  found_sep = NULL;
8468   BioseqPtr    nbsp;
8469   SeqIdPtr     sip, sip_next;
8470   CharPtr      tmp;
8471   BioseqSetPtr bssp;
8472   
8473   if (IS_Bioseq (sep))
8474   {
8475     nbsp = sep->data.ptrvalue;
8476     for (sip = nbsp->id; sip != NULL && found_sep == NULL; sip = sip_next)
8477     {
8478       sip_next = sip->next;
8479       sip->next = NULL;
8480       tmp = SeqIdWholeLabel (sip, PRINTID_REPORT);
8481       sip->next = sip_next;
8482       if (StringCmp (tmp, transcript_id) == 0)
8483       {
8484         found_sep = sep;
8485       }
8486       tmp = MemFree (tmp);
8487     }
8488   }
8489   else
8490   {
8491     bssp = (BioseqSetPtr) sep->data.ptrvalue;
8492     for (sep = bssp->seq_set; sep != NULL && found_sep == NULL; sep = sep->next)
8493     {
8494       found_sep = FindSeqEntryWithTranscriptID (sep, transcript_id);
8495     }
8496   }
8497   return found_sep; 
8498 }
8499 
8500 /* This section of code is used for matching up proteins to coding region locations
8501  * on the nucleotide sequences.
8502  */
8503 
8504 /* A ValNode list will be used to hold the list of pairings between protein and nucleotide
8505  * sequences.  There will be one ValNode per protein sequence.  The choice for the ValNode
8506  * indicates the position of the nucleotide sequence in the set plus one - a zero indicates
8507  * that there is no nucleotide for this protein.  The data.ptrvalue will be used to hold the
8508  * location of the coding region on the nucleotide.
8509  */
8510 
8511 /* This function frees the AssociationList. */ 
8512 extern NucProtAssocPtr FreeAssociationList (NucProtAssocPtr assoc_list)
8513 {
8514   if (assoc_list == NULL)
8515   {
8516     return NULL;
8517   }
8518   assoc_list->next = FreeAssociationList (assoc_list->next);
8519   assoc_list->loc = SeqLocFree (assoc_list->loc);
8520   assoc_list = MemFree (assoc_list);
8521   return assoc_list;
8522 }
8523 
8524 static NucProtAssocPtr NewAssociationList (NucProtAssocPtr PNTR assoc_list, Int4 position, SeqLocPtr loc)
8525 {
8526   NucProtAssocPtr last = NULL;
8527   NucProtAssocPtr new_assoc = (NucProtAssocPtr) MemNew (sizeof (NucProtAssocData));
8528   
8529   if (assoc_list == NULL) {
8530     return NULL;
8531   }
8532   if (new_assoc != NULL) {
8533     new_assoc->position = position;
8534     new_assoc->loc = loc;
8535     new_assoc->next = NULL;
8536     if (*assoc_list == NULL) {
8537       *assoc_list = new_assoc;
8538     } else {
8539       last = *assoc_list;
8540       while (last->next != NULL) {
8541         last = last->next;
8542       }
8543       last->next = new_assoc;
8544     }
8545   }
8546   return *assoc_list; 
8547 }
8548 
8549 /* This function copies the AssociationList */
8550 static NucProtAssocPtr CopyAssociationList (NucProtAssocPtr orig_assoc_list)
8551 {
8552   NucProtAssocPtr copy_assoc_list = NULL;
8553   
8554   if (orig_assoc_list == NULL)
8555   {
8556     return NULL;
8557   }
8558   copy_assoc_list = (NucProtAssocPtr) MemNew (sizeof (NucProtAssocData));
8559   if (copy_assoc_list != NULL)
8560   {
8561     copy_assoc_list->position = orig_assoc_list->position;
8562     copy_assoc_list->loc = SeqLocCopy (orig_assoc_list->loc);
8563     copy_assoc_list->next = CopyAssociationList (orig_assoc_list->next);
8564   }
8565   
8566   return copy_assoc_list;
8567 }
8568 
8569 
8570 /* This function determines whether all proteins have been assigned to 
8571  * nucleotide sequences. 
8572  */
8573 static Boolean AllLocationsProvided (NucProtAssocPtr vnp)
8574 {
8575   if (vnp == NULL)
8576   {
8577     return FALSE;
8578   }
8579   while (vnp != NULL)
8580   {
8581     if (vnp->position == 0)
8582     {
8583       return FALSE;
8584     }
8585     vnp = vnp->next;
8586   }
8587   return TRUE;
8588 }
8589 
8590 /* This function determines whether any proteins have been assigned to 
8591  * nucleotide sequences. 
8592  */
8593 static Boolean AnyLocationsProvided (NucProtAssocPtr vnp)
8594 {
8595   if (vnp == NULL)
8596   {
8597     return FALSE;
8598   }
8599   while (vnp != NULL)
8600   {
8601     if (vnp->position != 0)
8602     {
8603       return TRUE;
8604     }
8605     vnp = vnp->next;
8606   }
8607   return FALSE;
8608 }
8609 
8610 /* Given a nucleotide-protein pair, this function calculates a coding region location
8611  * using Suggest Intervals.  If no location is found, a location that includes the
8612  * entire sequence is returned instead.
8613  */
8614 static SeqLocPtr DefaultPairInterval (BioseqPtr nbsp, BioseqPtr pbsp, Int2 code)
8615 {
8616   SeqLocPtr slp;
8617   ErrSev    oldsev;
8618   Char      prot_str[3];
8619   Boolean   partial5 = FALSE, partial3 = FALSE;
8620   
8621   if (nbsp == NULL || pbsp == NULL)
8622   {
8623     return NULL;
8624   }
8625     
8626   /* need to suppress errors */  
8627   oldsev = ErrSetMessageLevel (SEV_MAX);
8628 
8629   /* try to get location using SuggestIntervals */
8630   SetBatchSuggestNucleotide (nbsp, code);
8631   slp = PredictCodingRegion (nbsp, pbsp, code);
8632   ClearBatchSuggestNucleotide ();  
8633   
8634   ErrSetMessageLevel (oldsev);
8635   
8636   /* if no location, use entire sequence */
8637   if (slp == NULL)
8638   {
8639     slp = SeqLocIntNew (0, nbsp->length - 1, Seq_strand_plus, nbsp->id); 
8640   }
8641 
8642   /* check for start and stop codons */  
8643   SeqPortStreamInt (pbsp, 0, 1, Seq_strand_plus, EXPAND_GAPS_TO_DASHES, (Pointer) (prot_str), NULL);
8644   if (prot_str[0] != 'M') {
8645     partial5 = TRUE;
8646   }
8647   
8648   if (SeqLocLen (slp) / 3 != pbsp->length + 1) {
8649     partial3 = TRUE;
8650   }
8651   
8652   SetSeqLocPartial (slp, partial5, partial3);
8653   
8654   return slp;
8655 }
8656 
8657 
8658 static Boolean FindFeaturesInIdenticalRegions (NucProtAssocPtr assoc_list)
8659 {
8660   Char       path [PATH_MAX];
8661   FILE       *fp;
8662   NucProtAssocPtr   vnp;
8663   SeqFeatPtr sfp;
8664   SeqMgrFeatContext fcontext;  
8665   Char        id_txt [128];
8666   Boolean     found_any = FALSE;
8667 
8668   if (assoc_list == NULL)
8669   {
8670     return FALSE;
8671   }
8672   
8673   TmpNam (path);
8674   fp = FileOpen (path, "wb");
8675   
8676   for (vnp = assoc_list; vnp != NULL; vnp = vnp->next) {
8677     if (vnp->loc != NULL) {
8678       sfp = SeqMgrGetOverlappingCDS (vnp->loc, &fcontext);
8679       if (sfp != NULL && SeqLocCompare (vnp->loc, sfp->location) == SLC_A_EQ_B) {
8680         if (fp == NULL) {
8681           return TRUE;
8682         } else {
8683           found_any = TRUE;
8684           SeqIdWrite (SeqLocId (vnp->loc), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);          
8685           fprintf (fp, "%s\n", id_txt);
8686         }
8687       }
8688     }
8689   }
8690   FileClose (fp);
8691 
8692   if (found_any) {
8693     LaunchGeneralTextViewer (path, "Sequences with pre-existing Coding Regions");
8694   }
8695   FileRemove (path);   
8696   return found_any;
8697 }
8698 
8699 
8700 static Int2 GetGeneticCodeFromBioseq (BioseqPtr bsp)
8701 {
8702   Int2 code = 1;
8703   SeqDescrPtr sdp;
8704   SeqMgrDescContext context;
8705   CharPtr location;
8706 
8707   if (bsp != NULL) {
8708     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &context);
8709     if (sdp != NULL) {
8710       location = FindValueFromPairInDeflineBeforeCharPtr ("location", sdp->data.ptrvalue, NULL);
8711       if (!StringHasNoText (location)) {      
8712         code = UseGeneticCodeForLocation (location);
8713       }
8714     }
8715   }
8716   return code;
8717 }
8718 
8719 
8720 /* This function takes a ValNode list where each ValNode represents
8721  * a protein in prot_list (in order).  The choice for each ValNode
8722  * represents the position of the chosen nucleotide in the nuc_list
8723  * (position includes segments in segmented sets, which is why 
8724  * FindNthSequenceInSet is used) plus one - zero indicates that there
8725  * is no nucleotide sequence for this protein.
8726  * The data.ptrvalue for the ValNode is to be populated with a
8727  * coding region SeqLoc, or NULL if there is no nucleotide for the protein.
8728  */
8729 static Boolean 
8730 PickCodingRegionLocationsForProteinNucleotidePairs 
8731 (NucProtAssocPtr  assoc_list,
8732  SeqEntryPtr nuc_list,
8733  SeqEntryPtr prot_list)
8734 {
8735   NucProtAssocPtr vnp_assoc;
8736   Int4       data_row;
8737   BioseqPtr  nbsp, pbsp;
8738   SeqLocPtr  slp;
8739   Char       path [PATH_MAX];
8740   FILE       *fp;
8741   Boolean    errors_found = FALSE;
8742   Char       n_idstr[128];
8743   Char       p_idstr[128];
8744   Int2       code;
8745   
8746   if (assoc_list == NULL || nuc_list == NULL || prot_list == NULL)
8747   {
8748     return FALSE;
8749   }
8750 
8751   TmpNam (path);
8752   fp = FileOpen (path, "wb");
8753 
8754   vnp_assoc = assoc_list;
8755   for (data_row = 0, vnp_assoc = assoc_list;
8756        vnp_assoc != NULL; 
8757        data_row++, vnp_assoc = vnp_assoc->next)
8758   {
8759     if (vnp_assoc->position > 0)
8760     {
8761       nbsp = FindNthSequenceInSet (nuc_list, vnp_assoc->position - 1, NULL, TRUE);
8762       pbsp = FindNthSequenceInSet (prot_list, data_row, NULL, FALSE);
8763       if (nbsp == NULL || pbsp == NULL) {
8764         slp = NULL;
8765       } else if ((nbsp->length +1) / 3 < pbsp->length) {
8766         if (fp != NULL) {
8767           SeqIdWrite (SeqIdFindWorst (nbsp->id), n_idstr, PRINTID_REPORT,
8768                       sizeof (n_idstr));
8769           SeqIdWrite (SeqIdFindWorst (pbsp->id), p_idstr, PRINTID_REPORT,
8770                       sizeof (p_idstr));
8771           fprintf (fp, "%s is too short to encode %s\n", n_idstr, p_idstr);
8772         }
8773         vnp_assoc->position = 0;
8774         errors_found = TRUE;
8775         slp = NULL;
8776       } else {
8777         code = GetGeneticCodeFromBioseq (nbsp);
8778         slp = DefaultPairInterval (nbsp, pbsp, code);
8779       }
8780     }
8781     else
8782     {
8783       slp = NULL;
8784     }
8785     vnp_assoc->loc = SeqLocFree (vnp_assoc->loc);
8786     vnp_assoc->loc = slp;
8787   }
8788   
8789   FileClose (fp);
8790   if (errors_found) {
8791     LaunchGeneralTextViewer (path, "Nucleotide-Protein Mismatches");
8792   }
8793   FileRemove (path);
8794   return !errors_found;
8795 }
8796 
8797 
8798 static Int2 FindGeneticCodeForBioseq (BioseqPtr bsp, Int2 default_code)
8799 {
8800   Int2         code = default_code;
8801   BioSourcePtr biop;
8802   SeqEntryPtr  nsep;
8803   BioseqSetPtr bssp;
8804   SeqDescrPtr  sdp = NULL;
8805 
8806   if (bsp == NULL) return default_code;
8807   nsep = GetBestTopParentForData (ObjMgrGetEntityIDForPointer (bsp), bsp);
8808   if (nsep == NULL || nsep->data.ptrvalue == NULL) return default_code;
8809   if (nsep->choice == 1)
8810   {
8811     bsp = nsep->data.ptrvalue;
8812     sdp = bsp->descr;
8813   }
8814   else if (nsep->choice == 2)
8815   {
8816     bssp = nsep->data.ptrvalue;
8817     sdp = bssp->descr;
8818   }
8819   while (sdp != NULL)
8820   {
8821     if (sdp->choice == Seq_descr_source && sdp->data.ptrvalue != NULL)
8822     {
8823       biop = (BioSourcePtr) sdp->data.ptrvalue;
8824       if (biop->org != NULL && biop->org->orgname != NULL)
8825       {
8826         code = BioSourceToGeneticCode (biop);
8827       }
8828     }
8829     sdp = sdp->next;
8830   }
8831   return code;
8832 }
8833 
8834 
8835 /* This function takes a ValNode list of coding region SeqLocs,
8836  * the list of nucleotide sequences, and the list of protein sequences
8837  * and creates the nuc-prot sets.
8838  */
8839 static void 
8840 AssignProteinsToSelectedNucleotides 
8841 (NucProtAssocPtr  assoc_list,
8842  SeqEntryPtr      nuc_list,
8843  SeqEntryPtr      prot_list,
8844  SequencesFormPtr sqfp, 
8845  Int2             code,
8846  Boolean          makeMRNA)
8847 {
8848   SeqEntryPtr prot_sep, nsep, prot_next;
8849   NucProtAssocPtr  vnp_assoc;
8850   BioseqPtr   nbsp;
8851   BioseqPtr PNTR bsp_array;
8852   Int4           prot_num;
8853   ValNodePtr     descr = NULL;
8854   Int2           genCode;
8855 
8856   if (assoc_list == NULL || nuc_list == NULL || prot_list == NULL)
8857   {
8858     return;
8859   }
8860 
8861   /* need to collect bioseqs before we start adding, otherwise the position in
8862    * the set changes */
8863   
8864   bsp_array = (BioseqPtr PNTR) MemNew (ValNodeLen (prot_list) * sizeof (BioseqPtr));
8865   if (bsp_array == NULL)
8866   {
8867     return;
8868   }
8869   
8870   for (prot_num = 0, vnp_assoc = assoc_list;
8871        vnp_assoc != NULL;
8872        prot_num++, vnp_assoc = vnp_assoc->next)
8873   {
8874     if (vnp_assoc->loc == NULL)
8875     {
8876       bsp_array [prot_num] = NULL;
8877     }
8878     else
8879     {
8880       bsp_array [prot_num] = FindNthSequenceInSet (nuc_list, vnp_assoc->position - 1, NULL, TRUE);
8881     }
8882   }
8883   
8884   for (prot_sep = prot_list, vnp_assoc = assoc_list, prot_num = 0;
8885        prot_sep != NULL && vnp_assoc != NULL;
8886        prot_sep = prot_next, vnp_assoc = vnp_assoc->next, prot_num++)
8887   {
8888     prot_next = prot_sep->next;
8889     prot_sep->next = NULL;
8890     
8891     if (vnp_assoc->loc == NULL)
8892     {
8893       /* discard protein */
8894       if (IS_Bioseq (prot_sep))
8895       {
8896         SeqMgrDeleteFromBioseqIndex (prot_sep->data.ptrvalue);
8897       }
8898       prot_sep = SeqEntryFree (prot_sep);
8899     }
8900     else
8901     {
8902       nbsp = bsp_array [prot_num];
8903       nsep = SeqMgrGetSeqEntryForData (nbsp);
8904       if (nbsp != NULL && nbsp->repr == Seq_repr_seg)
8905       {
8906         nsep = GetBestTopParentForData (ObjMgrGetEntityIDForPointer (nbsp), nbsp); 
8907       }
8908       genCode = FindGeneticCodeForBioseq (nbsp, code);
8909       if (nsep != NULL && nsep->data.ptrvalue == nbsp) {
8910         descr = ExtractBioSourceAndPubs (nsep);
8911       }
8912       AssignOneProtein (prot_sep, sqfp, nsep, vnp_assoc->loc, nbsp, 
8913                         genCode, makeMRNA);
8914       if (descr != NULL) {
8915         ReplaceBioSourceAndPubs (nsep, descr);
8916       }
8917       vnp_assoc->loc = NULL; /*SeqLoc was freed in AssignOneProtein */
8918     }
8919   }
8920   
8921   bsp_array = MemFree (bsp_array);
8922 }
8923 
8924 /* This function creates a new protein ID based on the nucleotide ID that will be
8925  * unique within the record - nucleotide and protein sequence IDs are checked
8926  * for matches.
8927  */
8928 static CharPtr 
8929 BuildProteinIDUniqueInIDAndTitleEdit 
8930 (CharPtr nuc_id,
8931  IDAndTitleEditPtr iatep_nuc,
8932  IDAndTitleEditPtr iatep_prot)
8933 {
8934   CharPtr new_id, cp;
8935   Int4    offset, seq_num;
8936   Boolean unique_found = FALSE;
8937   
8938   if (iatep_nuc == NULL || iatep_prot == NULL || StringHasNoText (nuc_id))
8939   {
8940     return NULL;
8941   }
8942   
8943   new_id = (CharPtr) MemNew ((StringLen (nuc_id) + 20) * sizeof (Char));
8944   if (new_id != NULL)
8945   {
8946     StringCpy (new_id, nuc_id);
8947     StringCat (new_id, "_");
8948     cp = new_id + StringLen (new_id);
8949     for (offset = 1; offset < INT4_MAX && ! unique_found; offset ++)
8950     {
8951       sprintf (cp, "%d", offset);
8952       unique_found = TRUE;
8953       for (seq_num = 0; seq_num < iatep_nuc->num_sequences && unique_found; seq_num++)
8954       {
8955         if (StringCmp (iatep_nuc->id_list [seq_num], new_id) == 0)
8956         {
8957           unique_found = FALSE;
8958         }
8959       }
8960       for (seq_num = 0; seq_num < iatep_prot->num_sequences && unique_found; seq_num++)
8961       {
8962         if (StringCmp (iatep_prot->id_list [seq_num], new_id) == 0)
8963         {
8964           unique_found = FALSE;
8965         }
8966       }
8967     }
8968   }
8969   if (unique_found)
8970   {
8971     return new_id;
8972   }
8973   else
8974   {
8975     new_id = MemFree (new_id);
8976     return StringSave ("too_many");
8977   }
8978 }
8979 
8980 static Boolean DoIdsMatch (CharPtr id1, CharPtr id2)
8981 {
8982   CharPtr tmp1, cp1 = NULL;
8983   CharPtr tmp2, cp2 = NULL;
8984   Boolean match = FALSE;
8985   
8986   tmp1 = StringChr (id1, '|');
8987   if (tmp1 == NULL) {
8988     tmp1 = id1;
8989   } else if (tmp1 == id1 + 2) {
8990     tmp1++;
8991     cp1 = StringChr (tmp1, '|');
8992     if (cp1 != NULL) {
8993       *cp1 = 0;
8994     }
8995   }
8996   
8997   tmp2 = StringChr (id2, '|');
8998   if (tmp2 == NULL) {
8999     tmp2 = id2;
9000   } else if (tmp2 == id2 + 2) {
9001     tmp2++;
9002     cp2 = StringChr (tmp2, '|');
9003     if (cp2 != NULL) {
9004       *cp2 = 0;
9005     }
9006   }
9007  
9008   if (StringCmp (tmp1, tmp2) == 0)
9009   {
9010     match = TRUE;
9011   }
9012   if (cp1 != NULL) {
9013     *cp1 = '|';
9014   }
9015   if (cp2 != NULL) {
9016     *cp2 = '|';
9017   }
9018   return match;
9019 }
9020 
9021 
9022 /* if the user gave the protein sequences the same IDs as the nucleotide sequences,
9023  * we need to create new sequence IDs for the proteins so that they will be unique.
9024  * We should also make sure that sequence IDs that don't match nucleotide sequence
9025  * IDs are unique.
9026  */
9027 static void ReplaceDuplicateProteinIDs (SeqEntryPtr nuc_list, SeqEntryPtr prot_list)
9028 {
9029   Int4              nuc_seq_num, prot_seq_num, prot_seq_num_check;
9030   IDAndTitleEditPtr iatep_nuc, iatep_prot;
9031   Boolean           found_nuc_match;
9032   CharPtr           tmp_str, cp;
9033   BioseqPtr         prot_bsp, nuc_bsp;
9034   
9035   if (nuc_list == NULL || prot_list == NULL)
9036   {
9037     return;
9038   }
9039   
9040   iatep_nuc = SeqEntryListToIDAndTitleEditEx (nuc_list, TRUE);
9041   iatep_prot = SeqEntryListToIDAndTitleEdit (prot_list);
9042   if (iatep_nuc != NULL && iatep_prot != NULL)
9043   {
9044     for (prot_seq_num = 0; prot_seq_num < iatep_prot->num_sequences; prot_seq_num++)
9045     {
9046       /* This part replaces any protein sequence IDs that match a nucleotide ID with
9047        * the nucleotide ID plus an underscore plus a number that makes the ID
9048        * unique.
9049        */
9050       found_nuc_match = FALSE;
9051       prot_bsp = FindNthSequenceInSet (prot_list, prot_seq_num, &(iatep_prot->is_seg[prot_seq_num]), FALSE);
9052       if (prot_bsp == NULL) continue;
9053       for (nuc_seq_num = 0;
9054            nuc_seq_num < iatep_nuc->num_sequences && ! found_nuc_match; 
9055            nuc_seq_num++)
9056       {
9057         nuc_bsp = FindNthSequenceInSet (nuc_list, nuc_seq_num, &(iatep_nuc->is_seg[prot_seq_num]), TRUE);
9058         if (nuc_bsp == NULL) continue;
9059       
9060         if (SeqIdIn (prot_bsp->id, nuc_bsp->id) || RelaxedSeqIdIn (prot_bsp->id, nuc_bsp->id)
9061             || DoIdsMatch (iatep_nuc->id_list [nuc_seq_num],
9062                            iatep_prot->id_list [prot_seq_num])) {
9063           tmp_str = iatep_nuc->id_list [nuc_seq_num];
9064           cp = StringChr (tmp_str, '|');
9065           if (cp == tmp_str + 2) {
9066             tmp_str += 3;
9067             cp = StringChr (tmp_str, '|');
9068           }
9069           if (cp != NULL) {
9070             *cp = 0;
9071           }
9072                         
9073           iatep_prot->id_list [prot_seq_num] = MemFree (iatep_prot->id_list [prot_seq_num]);
9074           iatep_prot->id_list [prot_seq_num] = BuildProteinIDUniqueInIDAndTitleEdit (tmp_str,
9075                                                                                      iatep_nuc,
9076                                                                                      iatep_prot);
9077           if (cp != NULL) {
9078             *cp = '|';
9079           }
9080           found_nuc_match = TRUE;
9081         }
9082       }
9083       /* This part replaces a protein sequence ID that matches a previous protein
9084        * sequence ID with the original protein sequence ID plus an underscore plus
9085        * a number that makes the ID unique.
9086        */
9087       if (!found_nuc_match)
9088       {
9089         for (prot_seq_num_check = prot_seq_num + 1; 
9090              prot_seq_num_check < iatep_prot->num_sequences;
9091              prot_seq_num_check ++)
9092         {
9093           if (StringCmp (iatep_prot->id_list [prot_seq_num],
9094                          iatep_prot->id_list [prot_seq_num_check]) == 0)
9095           {
9096             tmp_str = iatep_prot->id_list [prot_seq_num_check];
9097             cp = StringChr (tmp_str, '|');
9098             if (cp == tmp_str + 2) {
9099               tmp_str += 3;
9100               cp = StringChr (tmp_str, '|');
9101             }
9102             if (cp != NULL) {
9103               *cp = 0;
9104             }
9105             tmp_str = StringSave (tmp_str);
9106           
9107             iatep_prot->id_list [prot_seq_num_check] = MemFree (iatep_prot->id_list [prot_seq_num_check]);
9108             iatep_prot->id_list [prot_seq_num_check] = BuildProteinIDUniqueInIDAndTitleEdit (tmp_str,
9109                                                                                              iatep_nuc,
9110                                                                                              iatep_prot);
9111             tmp_str = MemFree (tmp_str);
9112           }
9113         }
9114       }
9115     }
9116   }
9117   ApplyIDAndTitleEditToSeqEntryList (prot_list, iatep_prot);
9118   iatep_prot = IDAndTitleEditFree (iatep_prot);
9119   iatep_nuc = IDAndTitleEditFree (iatep_nuc);
9120 }
9121 
/* Control type for each of the five columns of a nucleotide-protein editing
 * row (presumably passed to the tag list dialog -- confirm at the point of
 * use, which is outside this section). */
9122 static Uint2 nucprotedit_types [] = {
9123   TAGLIST_PROMPT, TAGLIST_PROMPT, TAGLIST_POPUP, TAGLIST_TEXT, TAGLIST_TEXT
9124 };
9125 
/* Width for each of the five columns, in the same order as
 * nucprotedit_types. */
9126 static Uint2 nucprotedit_widths [] = {
9127   5, 20, 10, 15, 15
9128 };
9129 
/* Zero-based column indices within a row: nucleotide choice, gene locus
 * and protein name. */
9130 #define NUCPROTEDIT_NUCID_COLUMN 2
9131 #define NUCPROTEDIT_GENE_COLUMN  3
9132 #define NUCPROTEDIT_PROT_COLUMN  4
9133 
/* State of the nucleotide-protein association editor. */
9134 typedef struct nucprotedit
9135 {
9136   SeqEntryPtr nuc_list;            /* nucleotide sequences offered for assignment */
9137   SeqEntryPtr prot_list;           /* protein sequences, one editor row each */
9138   DialoG      dlg;                 /* dialog whose object extra is the TagList holding the rows */
9139   ButtoN      accept_btn;          /* presumably the accept button of the editor -- TODO confirm */
9140   NucProtAssocPtr  assoc_list;     /* current protein-to-nucleotide associations, in protein order */
9141   TexT        all_gene_txt;        /* presumably a text field applying a gene value to all rows -- TODO confirm */
9142   TexT        all_prot_txt;        /* presumably a text field applying a protein name to all rows -- TODO confirm */
9143 } NucProtEditData, PNTR NucProtEditPtr;
9144 
9145 static void PopulateNucProtEdit (NucProtEditPtr npep)
9146 {
9147   IDAndTitleEditPtr     iatep_nuc, iatep_prot;
9148   ValNodePtr            row_list = NULL;
9149   NucProtAssocPtr       vnp_assoc;
9150   TagListPtr            tlp;
9151   CharPtr               data_string, gene_locus, prot_name;
9152   Int4                  data_len;
9153   Int4                  prot_num;
9154   Int4                  old_scroll_pos = 0;
9155   
9156   if (npep == NULL)
9157   {
9158     return;
9159   }
9160   
9161   tlp = (TagListPtr) GetObjectExtra (npep->dlg);
9162   if (tlp == NULL)
9163   {
9164     return;
9165   }
9166   
9167   /* need to get bar value and reset after populating */
9168   if (tlp->bar != NULL)  
9169   {
9170     old_scroll_pos = GetBarValue (tlp->bar);
9171   }
9172   
9173   iatep_nuc = SeqEntryListToIDAndTitleEditEx (npep->nuc_list, TRUE);
9174   iatep_prot = SeqEntryListToIDAndTitleEdit (npep->prot_list);
9175   if (iatep_nuc != NULL && iatep_prot != NULL)
9176   {
9177     vnp_assoc = npep->assoc_list;
9178     for (prot_num = 0; prot_num < iatep_prot->num_sequences; prot_num++)
9179     {
9180       /* first column is protein ID */
9181       /* second column is choice for nucleotide ID */
9182       /* third column is gene locus tag */
9183       /* fourth column is protein name */
9184       /* fifth column indicates presence of suggested interval */
9185       gene_locus = FindValueFromPairInDefline ("gene", iatep_prot->title_list [prot_num]);
9186       prot_name = FindValueFromPairInDefline ("protein", iatep_prot->title_list [prot_num]);
9187       
9188       data_len = StringLen (iatep_prot->id_list [prot_num])
9189                   + 20
9190                   + StringLen (gene_locus)
9191                   + StringLen (prot_name);
9192       data_string = (CharPtr) MemNew (data_len * sizeof (Char));                  
9193       if (data_string != NULL)
9194       {
9195         sprintf (data_string, "%d\t%s\t%d\t%s\t%s\n",
9196                                prot_num + 1,
9197                                iatep_prot->id_list [prot_num],
9198                                vnp_assoc == NULL ? 0 : vnp_assoc->position,
9199                                gene_locus == NULL ? "" : gene_locus,
9200                                prot_name == NULL ? "" : prot_name);
9201         ValNodeAddPointer (&row_list, 0, data_string);                               
9202       }
9203       gene_locus = MemFree (gene_locus);
9204       prot_name = MemFree (prot_name);
9205       if (vnp_assoc != NULL)
9206       {
9207         vnp_assoc = vnp_assoc->next;
9208       }
9209     }
9210     SendMessageToDialog (npep->dlg, VIB_MSG_RESET);
9211     tlp->vnp = row_list;
9212 
9213     if (iatep_prot->num_sequences > tlp->rows)
9214     {
9215       tlp->max = MAX ((Int2) 0, (Int2) (iatep_prot->num_sequences - tlp->rows));  
9216       CorrectBarMax (tlp->bar, tlp->max);
9217       CorrectBarPage (tlp->bar, tlp->rows - 1, tlp->rows - 1); 
9218       Enable (tlp->bar);
9219       SetBarValue (tlp->bar, old_scroll_pos);
9220     }
9221     else
9222     {
9223       Hide (tlp->bar);
9224     }
9225     SendMessageToDialog (npep->dlg, VIB_MSG_REDRAW);    
9226   }
9227   
9228   iatep_nuc = IDAndTitleEditFree (iatep_nuc);
9229   iatep_prot = IDAndTitleEditFree (iatep_prot);
9230 }
9231 
9232 static CharPtr 
9233 GetTagListValueEx (TagListPtr tlp, Int4 seq_num, Int4 col_num);
9234 
9235 static void ApplyGeneNameToAllSequences (ButtoN b)
9236 {
9237   NucProtEditPtr npep;
9238   CharPtr        all_gene_name, new_val;
9239   TagListPtr     tlp;
9240   Int4           seq_num;
9241   ValNodePtr     vnp;
9242   
9243   npep = (NucProtEditPtr) GetObjectExtra (b);
9244   if (npep == NULL)
9245   {
9246     return;
9247   }
9248   
9249   tlp = (TagListPtr) GetObjectExtra (npep->dlg);
9250   if (tlp == NULL)
9251   {
9252     return;
9253   }
9254   all_gene_name = SaveStringFromText (npep->all_gene_txt);
9255   if (ANS_YES == Message (MSG_YN, "Are you sure you want to set all of the gene locus values to %s?",
9256                           all_gene_name))
9257   {
9258     for (vnp = tlp->vnp, seq_num = 0;
9259          vnp != NULL;
9260          vnp = vnp->next, seq_num++)
9261     {
9262       new_val = ReplaceTagListColumn (vnp->data.ptrvalue, all_gene_name, NUCPROTEDIT_GENE_COLUMN);
9263       vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
9264       vnp->data.ptrvalue = new_val;
9265     }
9266     SendMessageToDialog (npep->dlg, VIB_MSG_REDRAW);
9267   }
9268   all_gene_name = MemFree (all_gene_name);
9269 }
9270 
9271 static void ApplyProteinNameToAllSequences (ButtoN b)
9272 {
9273   NucProtEditPtr npep;
9274   CharPtr        all_prot_name, new_val;
9275   TagListPtr     tlp;
9276   Int4           seq_num;
9277   ValNodePtr     vnp;
9278   
9279   npep = (NucProtEditPtr) GetObjectExtra (b);
9280   if (npep == NULL)
9281   {
9282     return;
9283   }
9284   
9285   tlp = (TagListPtr) GetObjectExtra (npep->dlg);
9286   if (tlp == NULL)
9287   {
9288     return;
9289   }
9290   all_prot_name = SaveStringFromText (npep->all_prot_txt);
9291   if (ANS_YES == Message (MSG_YN, "Are you sure you want to set all of the protein names to %s?",
9292                           all_prot_name))
9293   {
9294     for (vnp = tlp->vnp, seq_num = 0;
9295          vnp != NULL;
9296          vnp = vnp->next, seq_num++)
9297     {
9298       new_val = ReplaceTagListColumn (vnp->data.ptrvalue, all_prot_name, NUCPROTEDIT_PROT_COLUMN);
9299       vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
9300       vnp->data.ptrvalue = new_val;
9301     }
9302     SendMessageToDialog (npep->dlg, VIB_MSG_REDRAW);   
9303   }
9304   all_prot_name = MemFree (all_prot_name);
9305 }
9306 
9307 static void ApplyNucProtEditGeneAndProt (NucProtEditPtr npep)
9308 {