|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/sequin/sequin2.c |
source navigation diff markup identifier search freetext search file search |
1 /* sequin2.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: sequin2.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 1/22/95
31 *
32 * $Revision: 6.670 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 #include "sequin.h"
46 #include <document.h>
47 #include <sequtil.h>
48 #include <biosrc.h>
49 #include <cdrgn.h>
50 #include <seqsub.h>
51 #include <tofasta.h>
52 #include <gather.h>
53 #include <subutil.h>
54 #include <suggslp.h>
55 #include <toasn3.h>
56 #include <toporg.h>
57 #include <salfiles.h>
58 #include <salsap.h>
59 #include <salign.h>
60 #include <edutil.h>
61 #include <vsm.h>
62 //#include <accentr.h>
63 //#include <accutils.h>
64 #include <pmfapi.h>
65 #include <explore.h>
66 #include <aliparse.h>
67 #include <algo/blast/api/twoseq_api.h>
68 #ifdef WIN_MOTIF
69 #include <netscape.h>
70 #endif
71 #include <actutils.h>
72 #include <salpanel.h>
73 #include <findrepl.h>
74 #include <macrodlg.h>
75 #include <macroapi.h>
76
77 extern EnumFieldAssoc biosource_genome_simple_alist [];
78 extern EnumFieldAssoc biosource_origin_alist [];
79
80 static ENUM_ALIST(biomol_nucX_alist)
81 {"Genomic DNA", 253},
82 {"Genomic RNA", 254},
83 {"Precursor RNA", 2},
84 {"mRNA [cDNA]", 3},
85 {"Ribosomal RNA", 4},
86 {"Transfer RNA", 5},
87 {"Small nuclear RNA", 6},
88 {"Small cytoplasmic RNA", 7},
89 {"Other-Genetic", 9},
90 {"cRNA", 11},
91 {"Small nucleolar RNA", 12},
92 {"Transcribed RNA", 13},
93 {"Transfer-messenger RNA", MOLECULE_TYPE_TMRNA },
94 END_ENUM_ALIST
95
96 static ENUM_ALIST(biomol_nucGen_alist)
97 {"Genomic DNA", 253},
98 {"Genomic RNA", 254},
99 END_ENUM_ALIST
100
101 static ENUM_ALIST(topology_nuc_alist)
102 {"Linear", TOPOLOGY_LINEAR},
103 {"Circular", TOPOLOGY_CIRCULAR},
104 END_ENUM_ALIST
105
106 static ENUM_ALIST(molecule_alist)
107 {"DNA", Seq_mol_dna },
108 {"RNA", Seq_mol_rna },
109 END_ENUM_ALIST
110
/* NOTE(review): presumably the buffer length used when formatting an
 * integer as text; no use is visible in this part of the file - confirm.
 */
#define PRINTED_INT_MAX_LEN   15

/* NOTE(review): the two CREATE_FASTA_* values look like severity levels for
 * FASTA creation problems; their uses are outside this part of the file.
 */
#define CREATE_FASTA_REQUIRED 0
#define CREATE_FASTA_WARNING  1
115
116 /* This structure holds a list of IDs and titles for a set of sequences.
117 * It can be used to represent the new list of sequences being imported,
118 * the existing list of sequences being imported, suggested changes for
119 * a list of sequences, etc.
120 */
121 typedef struct idandtitleedit
122 {
123 CharPtr PNTR id_list;
124 CharPtr PNTR title_list;
125 BoolPtr is_seg;
126 Int4 num_sequences;
127 Boolean nuc_only;
128 } IDAndTitleEditData, PNTR IDAndTitleEditPtr;
129
130 /* These functions are for creating, copying, and freeing lists
131 * of titles and IDs.
132 */
133 static IDAndTitleEditPtr IDAndTitleEditNew (void)
134 {
135 IDAndTitleEditPtr iatep;
136
137 iatep = (IDAndTitleEditPtr) MemNew (sizeof (IDAndTitleEditData));
138 if (iatep != NULL)
139 {
140 iatep->id_list = NULL;
141 iatep->title_list = NULL;
142 iatep->is_seg = NULL;
143 iatep->num_sequences = 0;
144 iatep->nuc_only = FALSE;
145 }
146 return iatep;
147 }
148
149 static void IDAndTitleEditInit (IDAndTitleEditPtr iatep, Int4 new_num_sequences)
150 {
151 Int4 seq_num;
152 if (iatep == NULL)
153 {
154 return;
155 }
156
157 /* free old lists, if any */
158 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
159 {
160 iatep->id_list [seq_num] = MemFree (iatep->id_list [seq_num]);
161 iatep->title_list [seq_num] = MemFree (iatep->title_list [seq_num]);
162 }
163 iatep->id_list = MemFree (iatep->id_list);
164 iatep->title_list = MemFree (iatep->title_list);
165 iatep->is_seg = MemFree (iatep->is_seg);
166
167 /* now create blanks for num_sequences entries */
168 iatep->num_sequences = MAX (0, new_num_sequences);
169 if (iatep->num_sequences > 0)
170 {
171 iatep->id_list = (CharPtr PNTR) MemNew (iatep->num_sequences * sizeof (CharPtr));
172 iatep->title_list = (CharPtr PNTR) MemNew (iatep->num_sequences * sizeof (CharPtr));
173 iatep->is_seg = (BoolPtr) MemNew (iatep->num_sequences * sizeof (Boolean));
174 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
175 {
176 iatep->id_list [seq_num] = NULL;
177 iatep->title_list [seq_num] = NULL;
178 iatep->is_seg [seq_num] = FALSE;
179 }
180 }
181 }
182
183 static IDAndTitleEditPtr IDAndTitleEditCopy (IDAndTitleEditPtr iatep_orig)
184 {
185 IDAndTitleEditPtr iatep_copy;
186 Int4 seq_num;
187
188 if (iatep_orig == NULL)
189 {
190 return NULL;
191 }
192
193 iatep_copy = IDAndTitleEditNew ();
194 if (iatep_copy == NULL)
195 {
196 return NULL;
197 }
198
199 IDAndTitleEditInit (iatep_copy, iatep_orig->num_sequences);
200 for (seq_num = 0; seq_num < iatep_copy->num_sequences; seq_num++)
201 {
202 iatep_copy->id_list [seq_num] = StringSave (iatep_orig->id_list [seq_num]);
203 iatep_copy->title_list [seq_num] = StringSave (iatep_orig->title_list [seq_num]);
204 if (iatep_orig->is_seg != NULL)
205 {
206 iatep_copy->is_seg [seq_num] = iatep_orig->is_seg [seq_num];
207 }
208 }
209
210 return iatep_copy;
211 }
212
213 static IDAndTitleEditPtr IDAndTitleEditFree (IDAndTitleEditPtr iatep)
214 {
215 Int4 i;
216
217 if (iatep != NULL)
218 {
219 for (i = 0; i < iatep->num_sequences; i++)
220 {
221 iatep->id_list [i] = MemFree (iatep->id_list [i]);
222 iatep->title_list [i] = MemFree (iatep->title_list [i]);
223 }
224 iatep->id_list = MemFree (iatep->id_list);
225 iatep->title_list = MemFree (iatep->title_list);
226 iatep->is_seg = MemFree (iatep->is_seg);
227 iatep = MemFree (iatep);
228 }
229 return iatep;
230 }
231
232 /* These functions are for applying lists of titles and IDs
233 * to a SeqEntry list.
234 */
235 static Int4 CountSequencesAndSegments (SeqEntryPtr list, Boolean nuc_only)
236 {
237 Int4 num_seqs = 0;
238 BioseqSetPtr bssp;
239 BioseqPtr bsp;
240
241 while (list != NULL)
242 {
243 if (list->data.ptrvalue != NULL)
244 {
245 if (IS_Bioseq (list))
246 {
247 bsp = (BioseqPtr) list->data.ptrvalue;
248 if (!nuc_only || ISA_na (bsp->mol)) {
249 num_seqs ++;
250 }
251 }
252 else if (IS_Bioseq_set (list))
253 {
254 bssp = (BioseqSetPtr) list->data.ptrvalue;
255 num_seqs += CountSequencesAndSegments (bssp->seq_set, nuc_only);
256 }
257 }
258 list = list->next;
259 }
260 return num_seqs;
261 }
262
263 static BioseqPtr FindNthSequenceInSet (SeqEntryPtr seq_list, Int4 nth, BoolPtr is_seg, Boolean nuc_only)
264 {
265 Int4 pos = 0;
266 BioseqPtr bsp = NULL;
267 BioseqSetPtr bssp;
268 SeqEntryPtr sep;
269
270 while (seq_list != NULL && bsp == NULL)
271 {
272 if (seq_list->data.ptrvalue != NULL)
273 {
274 if (IS_Bioseq (seq_list) && seq_list->data.ptrvalue != NULL
275 && (!nuc_only || ISA_na(((BioseqPtr)seq_list->data.ptrvalue)->mol)))
276 {
277 if (nth == pos)
278 {
279 bsp = seq_list->data.ptrvalue;
280 }
281 else
282 {
283 pos ++;
284 }
285 }
286 else if (IS_Bioseq_set (seq_list))
287 {
288 bssp = (BioseqSetPtr) seq_list->data.ptrvalue;
289 if (bssp->_class == BioseqseqSet_class_parts && is_seg != NULL)
290 {
291 *is_seg = TRUE;
292 }
293 sep = bssp->seq_set;
294 while (sep != NULL && bsp == NULL)
295 {
296 bsp = FindNthSequenceInSet (sep, nth - pos, is_seg, nuc_only);
297 if (bsp == NULL)
298 {
299 if (IS_Bioseq_set (sep))
300 {
301 bssp = (BioseqSetPtr) sep->data.ptrvalue;
302 pos += CountSequencesAndSegments (bssp->seq_set, nuc_only);
303 }
304 else if (IS_Bioseq (sep) && (!nuc_only || ISA_na (((BioseqPtr)(sep->data.ptrvalue))->mol)))
305 {
306 pos ++;
307 }
308 }
309 sep = sep->next;
310 }
311 if (bsp == NULL && is_seg != NULL)
312 {
313 *is_seg = FALSE;
314 }
315 }
316 }
317 seq_list = seq_list->next;
318 }
319 return bsp;
320 }
321
322 static IDAndTitleEditPtr SeqEntryListToIDAndTitleEditEx (SeqEntryPtr list, Boolean nuc_only)
323 {
324 IDAndTitleEditPtr iatep;
325 Int4 num_sequences, i;
326 BioseqPtr bsp;
327 SeqDescrPtr sdp;
328 SeqIdPtr sip;
329
330 num_sequences = CountSequencesAndSegments (list, nuc_only);
331 if (num_sequences == 0)
332 {
333 return NULL;
334 }
335
336 iatep = IDAndTitleEditNew ();
337 if (iatep == NULL)
338 {
339 return NULL;
340 }
341
342 iatep->nuc_only = nuc_only;
343 IDAndTitleEditInit (iatep, num_sequences);
344
345 for (i = 0; i < num_sequences; i++)
346 {
347 bsp = FindNthSequenceInSet (list, i, &(iatep->is_seg [i]), nuc_only);
348 if (bsp != NULL)
349 {
350 sip = SeqIdFindBest (bsp->id, SEQID_GENBANK);
351 if (sip != NULL)
352 {
353 if (sip->choice == SEQID_LOCAL) {
354 iatep->id_list [i] = SeqIdWholeLabel (sip, PRINTID_REPORT);
355 } else {
356 iatep->id_list [i] = SeqIdWholeLabel (sip, PRINTID_FASTA_SHORT);
357 }
358 }
359 sdp = bsp->descr;
360 while (sdp != NULL && sdp->choice != Seq_descr_title)
361 {
362 sdp = sdp->next;
363 }
364 if (sdp != NULL && !StringHasNoText (sdp->data.ptrvalue))
365 {
366 iatep->title_list [i] = StringSave (sdp->data.ptrvalue);
367 }
368 }
369 }
370 return iatep;
371 }
372 static IDAndTitleEditPtr SeqEntryListToIDAndTitleEdit (SeqEntryPtr list)
373 {
374 return SeqEntryListToIDAndTitleEditEx (list, FALSE);
375 }
376
377 static void ReplaceIDAndTitleForBioseq (BioseqPtr bsp, SeqIdPtr new_sip, CharPtr title)
378 {
379 SeqDescrPtr sdp;
380 SeqEntryPtr sep;
381
382 if (bsp == NULL)
383 {
384 return;
385 }
386
387 /* replace ID */
388
389 if (new_sip != NULL)
390 {
391 if (bsp->id != NULL)
392 {
393 new_sip->next = bsp->id->next;
394 bsp->id->next = NULL;
395 bsp->id = SeqIdFree (bsp->id);
396 }
397 bsp->id = new_sip;
398 SeqMgrReplaceInBioseqIndex(bsp);
399 }
400 else
401 {
402 bsp->id = SeqIdFree (bsp->id);
403 }
404
405 /* replace title */
406 if (title == NULL)
407 {
408 title = StringSave ("");
409 }
410 sdp = bsp->descr;
411 while (sdp != NULL && sdp->choice != Seq_descr_title)
412 {
413 sdp = sdp->next;
414 }
415 if (sdp == NULL)
416 {
417 sep = SeqMgrGetSeqEntryForData (bsp);
418 sdp = CreateNewDescriptor (sep, Seq_descr_title);
419 sdp->data.ptrvalue = title;
420 }
421 else
422 {
423 sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
424 sdp->data.ptrvalue = title;
425 }
426 }
427
428 static void ResetSegSetIDLists (SeqEntryPtr list)
429 {
430 BioseqSetPtr bssp, parts;
431 BioseqPtr seg_bsp;
432 SeqEntryPtr sep;
433 SeqLocPtr loc, next_loc, last_loc;
434
435 if (list == NULL)
436 {
437 return;
438 }
439
440 if (list->data.ptrvalue != NULL)
441 {
442 if (IS_Bioseq_set (list))
443 {
444 bssp = (BioseqSetPtr) list->data.ptrvalue;
445 if (bssp->_class == BioseqseqSet_class_segset)
446 {
447 sep = bssp->seq_set;
448 seg_bsp = NULL;
449 parts = NULL;
450 while (sep != NULL && (seg_bsp == NULL || parts == NULL))
451 {
452 if (IS_Bioseq (sep))
453 {
454 seg_bsp = sep->data.ptrvalue;
455 }
456 else if (IS_Bioseq_set (sep))
457 {
458 parts = sep->data.ptrvalue;
459 if (parts != NULL && parts->_class != BioseqseqSet_class_parts)
460 {
461 parts = NULL;
462 }
463 }
464 sep = sep->next;
465 }
466 if (seg_bsp != NULL)
467 {
468 /* remove old location */
469 loc = (SeqLocPtr) seg_bsp->seq_ext;
470 while (loc != NULL)
471 {
472 next_loc = loc->next;
473 loc->next = NULL;
474 loc = SeqLocFree (loc);
475 loc = next_loc;
476 }
477 seg_bsp->seq_ext = NULL;
478 /* put in new locations */
479 sep = parts->seq_set;
480 last_loc = NULL;
481 while (sep != NULL)
482 {
483 if (IS_Bioseq (sep) && sep->data.ptrvalue != NULL)
484 {
485 loc = SeqLocWholeNew (sep->data.ptrvalue);
486 if (loc != NULL)
487 {
488 if (last_loc == NULL)
489 {
490 seg_bsp->seq_ext = loc;
491 }
492 else
493 {
494 last_loc->next = loc;
495 }
496 last_loc = loc;
497 }
498 }
499 sep = sep->next;
500 }
501 }
502 }
503 else
504 {
505 ResetSegSetIDLists (bssp->seq_set);
506 }
507 }
508 }
509 ResetSegSetIDLists (list->next);
510 }
511
512
513 static Boolean ApplyIDAndTitleEditToSeqEntryList (SeqEntryPtr list, IDAndTitleEditPtr iatep)
514 {
515 Int4 i;
516 SeqIdPtr new_sip;
517 BioseqPtr bsp;
518
519 if (list == NULL || iatep == NULL)
520 {
521 return FALSE;
522 }
523
524 if (CountSequencesAndSegments (list, iatep->nuc_only) != iatep->num_sequences)
525 {
526 return FALSE;
527 }
528
529 for (i = 0; i < iatep->num_sequences; i++)
530 {
531 bsp = FindNthSequenceInSet (list, i, NULL, iatep->nuc_only);
532 if (bsp != NULL)
533 {
534 new_sip = NULL;
535 if (StringChr (iatep->id_list[i], '|') != NULL) {
536 new_sip = SeqIdParse (iatep->id_list[i]);
537 }
538 if (new_sip == NULL) {
539 new_sip = MakeSeqID (iatep->id_list [i]);
540 }
541 ReplaceIDAndTitleForBioseq (bsp, new_sip, StringSave (iatep->title_list [i]));
542 }
543 }
544 ResetSegSetIDLists (list);
545 return TRUE;
546 }
547
548 /* this section of code is used to read and parse the taxlist.txt
549 * and lineages.txt files */
550 static ValNodePtr orglist = NULL;
551
552 typedef struct orginfo
553 {
554 CharPtr taxname;
555 CharPtr common;
556 Int4 ngcode;
557 Int4 mgcode;
558 CharPtr div;
559 Int4 taxnum;
560 CharPtr lineage;
561 } OrgInfoData, PNTR OrgInfoPtr;
562
563 static FILE *OpenSequinDataFile (CharPtr filename)
564 {
565 Char str [PATH_MAX];
566 CharPtr ptr;
567 FILE *f = NULL;
568
569 if (StringHasNoText (filename))
570 {
571 return NULL;
572 }
573
574 ProgramPath (str, sizeof (str));
575 ptr = StringRChr (str, DIRDELIMCHR);
576 if (ptr == NULL)
577 {
578 return NULL;
579 }
580
581 *ptr = '\0';
582 FileBuildPath (str, NULL, filename);
583 f = FileOpen (str, "r");
584 if (f == NULL) {
585 if (GetAppParam ("NCBI", "NCBI", "DATA", "", str, sizeof (str))) {
586 FileBuildPath (str, NULL, filename);
587 f = FileOpen (str, "r");
588 }
589 }
590 return f;
591 }
592
593 static OrgInfoPtr FindByTaxNum (Int4 taxnum)
594 {
595 ValNodePtr vnp;
596 OrgInfoPtr oip;
597
598 for (vnp = orglist; vnp != NULL; vnp = vnp->next)
599 {
600 oip = (OrgInfoPtr) vnp->data.ptrvalue;
601 if (oip != NULL && oip->taxnum == taxnum)
602 {
603 return oip;
604 }
605 }
606 return NULL;
607 }
608
609 static OrgInfoPtr FindByTaxName (CharPtr taxname)
610 {
611 ValNodePtr vnp;
612 OrgInfoPtr oip;
613
614 if (StringHasNoText (taxname))
615 {
616 return NULL;
617 }
618
619 for (vnp = orglist; vnp != NULL; vnp = vnp->next)
620 {
621 oip = (OrgInfoPtr) vnp->data.ptrvalue;
622 if (oip != NULL && StringICmp (oip->taxname, taxname) == 0)
623 {
624 return oip;
625 }
626 }
627 return NULL;
628 }
629
630 static void AddLineagesToOrganismList (void)
631 {
632 ReadBufferData rbd;
633 CharPtr line;
634 CharPtr ptr;
635 FILE *f;
636 OrgInfoPtr oip;
637 Int4 taxnum;
638
639 /* can only add lineages to existing list */
640 if (orglist == NULL) return;
641
642 /* now read in lineages */
643 f = OpenSequinDataFile ("lineages.txt");
644
645 if (f != NULL)
646 {
647 rbd.fp = f;
648 rbd.current_data = NULL;
649 line = AbstractReadFunction (&rbd);
650 line = AbstractReadFunction (&rbd);
651 while (line != NULL)
652 {
653 ptr = StringChr (line, '\t');
654 if (ptr != NULL)
655 {
656 *ptr = '\0';
657 if (StrToLong (line, &taxnum))
658 {
659 oip = FindByTaxNum (taxnum);
660 if (oip != NULL)
661 {
662 oip->lineage = StringSave (ptr + 1);
663 }
664 }
665 }
666 line = AbstractReadFunction (&rbd);
667 }
668 FileClose (f);
669 }
670 }
671
672 static CharPtr GetNextToken (CharPtr PNTR pstart)
673 {
674 CharPtr pend;
675 CharPtr newval = NULL;
676
677 if (pstart == NULL || *pstart == NULL)
678 {
679 return NULL;
680 }
681
682 pend = StringChr (*pstart, '\t');
683 if (pend != NULL)
684 {
685 *pend = 0;
686 }
687 newval = StringSave (*pstart);
688 if (pend == NULL)
689 {
690 *pstart = NULL;
691 }
692 else
693 {
694 *pstart = pend + 1;
695 }
696 return newval;
697 }
698
699 static void LoadOrganismList (void)
700 {
701 ReadBufferData rbd;
702 CharPtr line;
703 CharPtr p_start, numval;
704 FILE *f;
705 OrgInfoPtr oip;
706
707 if (orglist != NULL) return;
708
709 f = OpenSequinDataFile ("taxlist.txt");
710
711 if (f != NULL) {
712 rbd.fp = f;
713 rbd.current_data = NULL;
714 line = AbstractReadFunction (&rbd);
715 line = AbstractReadFunction (&rbd);
716 while (line != NULL)
717 {
718 oip = (OrgInfoPtr) MemNew (sizeof (OrgInfoData));
719 if (oip != NULL)
720 {
721 p_start = line;
722 /* read in tax name */
723 oip->taxname = GetNextToken (&p_start);
724
725 /* read in common name */
726 oip->common = GetNextToken (&p_start);
727
728 /* read in nuclear genetic code */
729 numval = GetNextToken (&p_start);
730 if (numval != NULL)
731 {
732 StrToLong (numval, &(oip->ngcode));
733 numval = MemFree (numval);
734 }
735 /* read in mitochondrial genetic code */
736 numval = GetNextToken (&p_start);
737 if (numval != NULL)
738 {
739 StrToLong (numval, &(oip->mgcode));
740 numval = MemFree (numval);
741 }
742
743 /* read in div */
744 oip->div = GetNextToken (&p_start);
745
746 /* read in taxnum */
747 numval = GetNextToken (&p_start);
748 if (numval != NULL)
749 {
750 StrToLong (numval, &(oip->taxnum));
751 numval = MemFree (numval);
752 }
753
754 ValNodeAddPointer (&orglist, 0, oip);
755 }
756 line = MemFree (line);
757 line = AbstractReadFunction (&rbd);
758 }
759 FileClose (f);
760 }
761 AddLineagesToOrganismList ();
762 }
763
764 /* This section of code is used for determining genetic codes based on
765 * FASTA-defline values.
766 */
/* Results of UseGeneticCodeForLocation: which genetic code applies. */
#define USE_NUCLEAR_GENETIC_CODE       1
#define USE_MITOCHONDRIAL_GENETIC_CODE 2
#define USE_OTHER_GENETIC_CODE         3
770
771 static Int4 UseGeneticCodeForLocation (CharPtr location)
772 {
773 if (StringHasNoText (location))
774 {
775 return USE_NUCLEAR_GENETIC_CODE;
776 }
777 else if (StringICmp (location, "Mitochondrion") == 0
778 || StringICmp (location, "Kinetoplast") == 0
779 || StringICmp (location, "Hydrogenosome") == 0)
780 {
781 return USE_MITOCHONDRIAL_GENETIC_CODE;
782 }
783 else if (StringICmp (location, "Chloroplast") == 0
784 || StringICmp (location, "Chromoplast") == 0
785 || StringICmp (location, "plastid") == 0
786 || StringICmp (location, "cyanelle") == 0
787 || StringICmp (location, "apicoplast") == 0
788 || StringICmp (location, "leucoplast") == 0
789 || StringICmp (location, "proplastid") == 0)
790 {
791 return USE_OTHER_GENETIC_CODE;
792 }
793 else
794 {
795 return USE_NUCLEAR_GENETIC_CODE;
796 }
797 }
798
799
800 static Int4 GetGeneticCodeForTaxNameAndLocation (CharPtr taxname, CharPtr location)
801 {
802 ValNodePtr vnp;
803 OrgInfoPtr oip;
804 Int4 use_code;
805
806 use_code = UseGeneticCodeForLocation (location);
807 if (use_code == USE_OTHER_GENETIC_CODE)
808 {
809 return 11;
810 }
811 else if (StringHasNoText (taxname))
812 {
813 return -1;
814 }
815
816 for (vnp = orglist; vnp != NULL; vnp = vnp->next)
817 {
818 if (vnp->data.ptrvalue == NULL)
819 {
820 continue;
821 }
822 oip = (OrgInfoPtr) vnp->data.ptrvalue;
823 if (StringICmp (oip->taxname, taxname) == 0)
824 {
825 if (use_code == USE_NUCLEAR_GENETIC_CODE)
826 {
827 return oip->ngcode;
828 }
829 else
830 {
831 return oip->mgcode;
832 }
833 }
834 }
835
836 return -1;
837 }
838
839 static CharPtr GeneticCodeStringFromIntAndList (Int4 num, ValNodePtr list)
840 {
841 while (list != NULL)
842 {
843 if (list->choice == num)
844 {
845 return list->data.ptrvalue;
846 }
847 list = list->next;
848 }
849 return NULL;
850 }
851
852
853 /* these functions deal with commonly asked questions about package types -
854 * which ones are sets, which ones are single sequences, which ones have
855 * which default molecule types.
856 */
857 static Boolean PackageTypeIsSet (Int2 seqPackage)
858 {
859 if (seqPackage == SEQ_PKG_POPULATION
860 || seqPackage == SEQ_PKG_PHYLOGENETIC
861 || seqPackage == SEQ_PKG_MUTATION
862 || seqPackage == SEQ_PKG_ENVIRONMENT
863 || seqPackage == SEQ_PKG_GENBANK)
864 {
865 return TRUE;
866 }
867 else
868 {
869 return FALSE;
870 }
871
872 }
873
874 static Boolean PackageTypeIsSingle (Int2 seqPackage)
875 {
876 if (seqPackage == SEQ_PKG_SINGLE
877 || seqPackage == SEQ_PKG_SEGMENTED
878 || seqPackage == SEQ_PKG_GAPPED)
879 {
880 return TRUE;
881 }
882 else
883 {
884 return FALSE;
885 }
886 }
887
888 /* These functions are used to find titles in SeqEntries */
889 static void FindFirstTitle (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
890
891 {
892 CharPtr PNTR ttlptr;
893
894 if (mydata == NULL) return;
895 ttlptr = (CharPtr PNTR) mydata;
896 if (*ttlptr != NULL) return;
897 *ttlptr = SeqEntryGetTitle (sep);
898 }
899
900 static void FindFirstSeqEntryTitle (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
901
902 {
903 SeqEntryPtr PNTR sepptr;
904
905 if (mydata == NULL) return;
906 sepptr = (SeqEntryPtr PNTR) mydata;
907 if (*sepptr != NULL) return;
908 if (SeqEntryGetSeqDescr (sep, Seq_descr_title, NULL) != NULL) {
909 *sepptr = sep;
910 }
911 }
912
913 /* These functions are used to change the values of modifiers in definition lines */
914
915 extern void MakeSearchStringFromAlist (CharPtr str, CharPtr name)
916
917 {
918 Char ch;
919 CharPtr ptr;
920
921 StringCpy (str, "[");
922 StringCat (str, name);
923 StringCat (str, "=");
924 ptr = str;
925 ch = *ptr;
926 while (*ptr != '\0') {
927 *ptr = TO_LOWER (ch);
928 ptr++;
929 ch = *ptr;
930 }
931 }
932
933 /* This section of code is used for parsing well-formatted definition lines.
934 */
935 typedef struct modifieralias
936 {
937 CharPtr alias;
938 CharPtr modifier;
939 } ModifierAlias, PNTR ModifierAliasPtr;
940
941 static ModifierAlias alias_list [] =
942 {
943 { "org", "organism" },
944 { "mol-type", "moltype" },
945 { "mol_type", "moltype" },
946 { "note", "note-orgmod" },
947 { "comment", "note-orgmod" },
948 { "common-name", "common name"},
949 { "subsource", "note-subsrc" },
950 { "technique", "tech" },
951 { "prot", "protein" },
952 { "prot_desc", "protein_desc" }
953 };
954
955 static Int4 num_aliases = sizeof (alias_list) / sizeof (ModifierAlias);
956
957 static CharPtr protein_modifier_names [] =
958 {
959 "gene",
960 "gene_syn",
961 "protein",
962 "protein_desc",
963 "note",
964 "comment",
965 "orf",
966 "function",
967 "EC_number"
968 };
969
970 static Int4 num_protein_modifier_names = sizeof (protein_modifier_names) / sizeof (CharPtr);
971
972 static CharPtr GetCanonicalName (CharPtr mod_name)
973 {
974 Int4 j;
975 Uint1 subtype;
976
977 if (StringHasNoText (mod_name))
978 {
979 return StringSave ("");
980 }
981
982 for (j = 0; j < num_protein_modifier_names; j++) {
983 if (StringsAreEquivalent (mod_name, protein_modifier_names[j])) {
984 return StringSave (protein_modifier_names[j]);
985 }
986 }
987
988 subtype = EquivalentOrgMod (mod_name);
989 if (subtype != 0) {
990 return StringSave (GetOrgModQualName (subtype));
991 }
992
993 subtype = EquivalentSubSource (mod_name);
994 if (subtype != 0) {
995 return StringSave (GetSubsourceQualName (subtype));
996 }
997
998 for (j = 0; j < num_aliases; j++)
999 {
1000 if (StringsAreEquivalent (alias_list [j].alias, mod_name))
1001 {
1002 return StringSave (alias_list [j].modifier);
1003 }
1004 }
1005 return StringSave (mod_name);
1006 }
1007
/* Categories of definition-line modifiers, as assigned by GetModifierType. */
typedef enum {
  eModifierType_SourceQual = 0,       /* default for every other name */
  eModifierType_Organism,             /* "organism", "org" */
  eModifierType_Location,             /* "location" */
  eModifierType_Lineage,              /* "lineage" */
  eModifierType_GeneticCode,          /* "genetic_code" */
  eModifierType_GeneticCodeComment,   /* "gencode_comment" */
  eModifierType_NucGeneticCode,       /* "gcode" */
  eModifierType_MitoGeneticCode,      /* "mgcode" */
  eModifierType_MolType,              /* "moltype" */
  eModifierType_Molecule,             /* "molecule" */
  eModifierType_Origin,               /* "origin" */
  eModifierType_Topology,             /* "topology" */
  eModifierType_CommonName,           /* "common name" */
  eModifierType_Technique,            /* "tech" */
  eModifierType_Protein               /* any of protein_modifier_names */
} EModifierType;
1025
1026 typedef struct modifierinfo
1027 {
1028 CharPtr name;
1029 Uint1 subtype;
1030 CharPtr value;
1031 EModifierType modtype;
1032 } ModifierInfoData, PNTR ModifierInfoPtr;
1033
1034 static ModifierInfoPtr ModifierInfoNew (void)
1035 {
1036 ModifierInfoPtr mip;
1037 mip = (ModifierInfoPtr) MemNew (sizeof (ModifierInfoData));
1038 if (mip == NULL) return NULL;
1039 mip->name = NULL;
1040 mip->value = NULL;
1041 mip->modtype = eModifierType_SourceQual;
1042 return mip;
1043 }
1044
1045 static ModifierInfoPtr ModifierInfoFree (ModifierInfoPtr mip)
1046 {
1047 if (mip == NULL) return NULL;
1048 mip->name = MemFree (mip->name);
1049 mip->value = MemFree (mip->value);
1050 mip = MemFree (mip);
1051 return mip;
1052 }
1053
1054 static ValNodePtr ModifierInfoListFree (ValNodePtr list)
1055 {
1056 if (list == NULL) return NULL;
1057 ModifierInfoListFree (list->next);
1058 list->next = NULL;
1059 list->data.ptrvalue = ModifierInfoFree (list->data.ptrvalue);
1060 ValNodeFree (list);
1061 return NULL;
1062 }
1063
1064 static EModifierType GetModifierType (CharPtr mod_name)
1065 {
1066 Int4 i;
1067 CharPtr canonical_name;
1068 EModifierType returntype;
1069
1070 canonical_name = GetCanonicalName (mod_name);
1071
1072 if (StringHasNoText (canonical_name))
1073 {
1074 returntype = eModifierType_SourceQual;
1075 }
1076 else if (StringICmp (canonical_name, "organism") == 0
1077 || StringICmp (canonical_name, "org") == 0)
1078 {
1079 returntype = eModifierType_Organism;
1080 }
1081 else if (StringICmp (canonical_name, "location") == 0)
1082 {
1083 returntype = eModifierType_Location;
1084 }
1085 else if (StringICmp (canonical_name, "lineage") == 0)
1086 {
1087 returntype = eModifierType_Lineage;
1088 }
1089 else if (StringICmp (canonical_name, "gcode") == 0)
1090 {
1091 returntype = eModifierType_NucGeneticCode;
1092 }
1093 else if (StringICmp (canonical_name, "mgcode") == 0)
1094 {
1095 returntype = eModifierType_MitoGeneticCode;
1096 }
1097 else if (StringICmp (canonical_name, "genetic_code") == 0)
1098 {
1099 returntype = eModifierType_GeneticCode;
1100 }
1101 else if (StringICmp (canonical_name, "gencode_comment") == 0)
1102 {
1103 returntype = eModifierType_GeneticCodeComment;
1104 }
1105 else if (StringICmp (canonical_name, "moltype") == 0)
1106 {
1107 returntype = eModifierType_MolType;
1108 }
1109 else if (StringICmp (canonical_name, "molecule") == 0)
1110 {
1111 returntype = eModifierType_Molecule;
1112 }
1113 else if (StringICmp (canonical_name, "origin") == 0)
1114 {
1115 returntype = eModifierType_Origin;
1116 }
1117 else if (StringICmp (canonical_name, "topology") == 0)
1118 {
1119 returntype = eModifierType_Topology;
1120 }
1121 else if (StringICmp (canonical_name, "common name") == 0)
1122 {
1123 returntype = eModifierType_CommonName;
1124 }
1125 else if (StringICmp (canonical_name, "tech") == 0)
1126 {
1127 returntype = eModifierType_Technique;
1128 }
1129 else
1130 {
1131 for (i = 0; i < num_protein_modifier_names; i++)
1132 {
1133 if (StringICmp (canonical_name, protein_modifier_names[i]) == 0)
1134 {
1135 returntype = eModifierType_Protein;
1136 canonical_name = MemFree (canonical_name);
1137 return returntype;
1138 }
1139 }
1140 returntype = eModifierType_SourceQual;
1141 }
1142
1143 canonical_name = MemFree (canonical_name);
1144 return returntype;
1145 }
1146
1147 static Boolean AllowMultipleValues (CharPtr mod_name)
1148 {
1149 EModifierType mod_type;
1150 Boolean rval = FALSE;
1151
1152 mod_type = GetModifierType (mod_name);
1153 switch (mod_type)
1154 {
1155 case eModifierType_SourceQual:
1156 if (! IsNonTextModifier (mod_name))
1157 {
1158 rval = TRUE;
1159 }
1160 break;
1161 case eModifierType_CommonName:
1162 rval = TRUE;
1163 break;
1164 case eModifierType_Organism:
1165 rval = TRUE;
1166 break;
1167 default:
1168 rval = FALSE;
1169 break;
1170 }
1171 return rval;
1172 }
1173
/* Result codes of DetectBadBracketing. */
typedef enum
{
  BRACKET_ERR_NO_ERR = 0,            /* bracketing is well formed */
  BRACKET_ERR_MISMATCHED_BRACKETS,   /* '[' or ']' out of place, or left open */
  BRACKET_ERR_MISSING_EQUALS,        /* bracket found where '=' was expected */
  BRACKET_ERR_MULT_EQUALS,           /* second '=' before the closing ']' */
  BRACKET_ERR_NO_MOD_NAME,           /* only blanks between '[' and '=' */
  BRACKET_ERR_MISMATCHED_QUOTES      /* opening quote without a closing quote */
} bracketing_err_num;
1183
1184 static Char ExpectToken (CharPtr cp)
1185 {
1186 CharPtr valstart;
1187
1188 if (cp == NULL)
1189 {
1190 return 0;
1191 }
1192 else if (*cp == '[')
1193 {
1194 valstart = cp + 1 + StringSpn (cp + 1, " \t");
1195 if (StringLen (valstart) > 3
1196 && (StringNICmp (valstart, "dna", 3) == 0
1197 || StringNICmp (valstart, "rna", 3) == 0
1198 || StringNICmp (valstart, "orf", 3) == 0)
1199 && *(valstart + 3 + StringSpn (valstart + 3, " \t")) == ']')
1200 {
1201 return ']';
1202 }
1203 else
1204 {
1205 return '=';
1206 }
1207 }
1208 else if (*cp == '=')
1209 {
1210 return ']';
1211 }
1212 else if (*cp == ']')
1213 {
1214 return '[';
1215 }
1216 else
1217 {
1218 return 0;
1219 }
1220 }
1221
1222 /* When we are looking for double-quotation marks to use for delimiting
1223 * sections of a title that should not be parsed or values that may contain
1224 * brackets, equals signs, or other reserved characters, skip over
1225 * quotation marks that are preceded by the escape character (backslash).
1226 * This allows quotation marks to be included in a quoted string.
1227 */
1228 static CharPtr NextUnescapedQuote (CharPtr str)
1229 {
1230 CharPtr cp;
1231
1232 if (StringHasNoText (str))
1233 {
1234 return NULL;
1235 }
1236 cp = StringChr (str, '"');
1237 if (cp != NULL && cp != str)
1238 {
1239 while (cp != NULL && *(cp - 1) == '\\')
1240 {
1241 cp = StringChr (cp + 1, '"');
1242 }
1243 }
1244 return cp;
1245 }
1246
1247 /* This function steps backward from str_end until it has located
1248 * an unescaped double-quotation mark or it has reached the
1249 * start of the string (str_start).
1250 */
1251 static CharPtr FindPreviousUnescapedQuote (CharPtr str_start, CharPtr str_end)
1252 {
1253 CharPtr cp;
1254 if (str_start == NULL || str_end == NULL || str_end < str_start)
1255 {
1256 return NULL;
1257 }
1258
1259 cp = str_end;
1260 while (cp > str_start && (*cp != '"' || *(cp - 1) == '\\'))
1261 {
1262 cp--;
1263 }
1264 if (*cp != '"')
1265 {
1266 cp = NULL;
1267 }
1268 return cp;
1269 }
1270
1271
1272 /* This function finds the next bracketing token ([, =, or ]) in
1273 * the string that is not enclosed by unescaped quotation marks.
1274 */
1275 static CharPtr NextBracketToken (CharPtr str)
1276 {
1277 CharPtr next_quote;
1278 CharPtr cp;
1279
1280 if (StringHasNoText (str))
1281 {
1282 return NULL;
1283 }
1284
1285 cp = str;
1286 while (*cp != 0)
1287 {
1288 switch (*cp)
1289 {
1290 case '"':
1291 if (cp == str || (*(cp - 1) != '\\'))
1292 {
1293 next_quote = NextUnescapedQuote (cp + 1);
1294 if (next_quote == NULL)
1295 {
1296 return cp;
1297 }
1298 else
1299 {
1300 cp = next_quote + 1;;
1301 }
1302 }
1303 else
1304 {
1305 cp++;
1306 }
1307 break;
1308 case '[':
1309 case ']':
1310 case '=':
1311 return cp;
1312 default:
1313 cp++;
1314 }
1315 }
1316
1317 return NULL;
1318 }
1319
1320 static Int4 DetectBadBracketing (CharPtr str)
1321 {
1322 CharPtr cp;
1323 Char expected_token;
1324 CharPtr last_token = NULL, namestart;
1325
1326 if (StringHasNoText (str))
1327 {
1328 return BRACKET_ERR_NO_ERR;
1329 }
1330
1331 expected_token = '[';
1332 cp = NextBracketToken (str);
1333 while (cp != NULL)
1334 {
1335 switch (*cp)
1336 {
1337 case '"':
1338 return BRACKET_ERR_MISMATCHED_QUOTES;
1339 break;
1340 case '[':
1341 case ']':
1342 case '=':
1343 if (expected_token == *cp)
1344 {
1345 if (expected_token == '=' && last_token != NULL)
1346 {
1347 namestart = last_token + 1 + StringSpn (last_token + 1, " \t");
1348 if (namestart == cp)
1349 {
1350 return BRACKET_ERR_NO_MOD_NAME;
1351 }
1352 }
1353 expected_token = ExpectToken (cp);
1354 last_token = cp;
1355 }
1356 else if (expected_token == '=')
1357 {
1358 if (cp - last_token - 1 == StringSpn (last_token + 1, " \t"))
1359 {
1360 return BRACKET_ERR_MISMATCHED_BRACKETS;
1361 }
1362 else
1363 {
1364 return BRACKET_ERR_MISSING_EQUALS;
1365 }
1366 }
1367 else if (*cp == '=')
1368 {
1369 if (expected_token == ']')
1370 {
1371 return BRACKET_ERR_MULT_EQUALS;
1372 }
1373 else
1374 {
1375 return BRACKET_ERR_MISMATCHED_BRACKETS;
1376 }
1377 }
1378 else
1379 {
1380 return BRACKET_ERR_MISMATCHED_BRACKETS;
1381 }
1382 break;
1383 }
1384 cp = NextBracketToken (cp + 1);
1385 }
1386
1387 if (cp == NULL && expected_token != '[')
1388 {
1389 return BRACKET_ERR_MISMATCHED_BRACKETS;
1390 }
1391
1392 return BRACKET_ERR_NO_ERR;
1393 }
1394
1395 static ModifierInfoPtr
1396 ParseOneBracketedModifier
1397 (CharPtr str,
1398 CharPtr PNTR bracket_start,
1399 CharPtr PNTR bracket_stop)
1400 {
1401 CharPtr start, stop, eq_loc;
1402 ModifierInfoPtr mip;
1403 Int4 value_len, name_len;
1404 CharPtr canonical_name;
1405
1406 start = NextBracketToken (str);
1407 while (start != NULL && *start != '[')
1408 {
1409 start = NextBracketToken (start + 1);
1410 }
1411 if (start == NULL) return NULL;
1412 eq_loc = NextBracketToken (start + 1);
1413 if (eq_loc == NULL) return NULL;
1414 if (*eq_loc == ']')
1415 {
1416 stop = eq_loc;
1417 }
1418 else if (*eq_loc == '=')
1419 {
1420 stop = NextBracketToken (eq_loc + 1);
1421 }
1422 else
1423 {
1424 return NULL;
1425 }
1426
1427 if (stop == NULL || *stop != ']') return NULL;
1428
1429 mip = ModifierInfoNew();
1430 if (mip == NULL) return NULL;
1431
1432 /* copy in modifier name */
1433 name_len = eq_loc - start + 1;
1434 mip->name = (CharPtr) MemNew (name_len * sizeof (Char));
1435 if (mip->name == NULL)
1436 {
1437 mip = ModifierInfoFree (mip);
1438 return NULL;
1439 }
1440 StringNCpy (mip->name, start + 1, name_len - 2);
1441 mip->name [name_len - 1] = 0;
1442 TrimSpacesAroundString (mip->name);
1443 canonical_name = GetCanonicalName (mip->name);
1444 mip->name = MemFree (mip->name);
1445 mip->name = canonical_name;
1446 if (StringICmp (mip->name, "note") == 0)
1447 {
1448 mip->name = MemFree (mip->name);
1449 mip->name = StringSave ("Note-SubSrc");
1450 }
1451
1452 /* [orf], [rna], and [dna] don't have values */
1453 if (stop > eq_loc)
1454 {
1455 value_len = stop - eq_loc + 1;
1456 mip->value = (CharPtr) MemNew (value_len * sizeof (Char));
1457 if (mip->value == NULL)
1458 {
1459 mip = ModifierInfoFree (mip);
1460 return NULL;
1461 }
1462
1463 StringNCpy (mip->value, eq_loc + 1, value_len - 2);
1464 mip->value [value_len - 1] = 0;
1465 TrimSpacesAroundString (mip->value);
1466 }
1467
1468 mip->modtype = GetModifierType (mip->name);
1469 if (mip->modtype == eModifierType_SourceQual)
1470 {
1471 mip->subtype = FindTypeForModNameText (mip->name);
1472 }
1473 else
1474 {
1475 mip->subtype = 0;
1476 }
1477
1478 if (bracket_start != NULL)
1479 {
1480 *bracket_start = start;
1481 }
1482
1483 if (bracket_stop != NULL)
1484 {
1485 *bracket_stop = stop;
1486 }
1487
1488 return mip;
1489 }
1490
1491 static ValNodePtr ParseAllBracketedModifiers (CharPtr str)
1492 {
1493 CharPtr stop, cp;
1494 ValNodePtr list = NULL;
1495 ModifierInfoPtr mip;
1496
1497 cp = str;
1498 mip = ParseOneBracketedModifier (cp, NULL, &stop);
1499 while (mip != NULL && stop != NULL)
1500 {
1501 ValNodeAddPointer (&list, 0, mip);
1502 cp = stop + 1;
1503 mip = ParseOneBracketedModifier (cp, NULL, &stop);
1504 }
1505 return list;
1506 }
1507
1508 static Boolean IsValueInEnumAssoc (CharPtr value, EnumFieldAssocPtr eap)
1509 {
1510 while (eap != NULL && eap->name != NULL)
1511 {
1512 if (StringICmp (eap->name, value) == 0)
1513 {
1514 return TRUE;
1515 }
1516 eap++;
1517 }
1518 return FALSE;
1519 }
1520
1521 static Int4 GeneticCodeFromStringAndList (CharPtr str, ValNodePtr list)
1522 {
1523 while (list != NULL)
1524 {
1525 if (StringICmp (str, list->data.ptrvalue) == 0)
1526 {
1527 return list->choice;
1528 }
1529 list = list->next;
1530 }
1531 return 0;
1532 }
1533
1534 static Int4 GeneticCodeFromString (CharPtr str)
1535 {
1536 ValNodePtr gencodelist;
1537 Int4 gcode = 0;
1538
1539 if (StringHasNoText (str))
1540 {
1541 gcode = 0;
1542 }
1543 else if (isdigit (str[0]))
1544 {
1545 gcode = atoi (str);
1546 }
1547 else
1548 {
1549 gencodelist = GetGeneticCodeValNodeList ();
1550 gcode = GeneticCodeFromStringAndList (str, gencodelist);
1551 gencodelist = ValNodeFreeData (gencodelist);
1552 }
1553 return gcode;
1554 }
1555
1556 static Int4 MolTypeFromString (CharPtr str)
1557 {
1558 EnumFieldAssocPtr eap;
1559
1560 if (StringICmp (str, "dna") == 0)
1561 {
1562 return 253;
1563 }
1564 else if (StringICmp (str, "rna") == 0)
1565 {
1566 return 254;
1567 }
1568 else if (StringICmp (str, "genomic") == 0)
1569 {
1570 return 253;
1571 }
1572 for (eap = biomol_nucGen_alist; eap != NULL && eap->name != NULL; eap++)
1573 {
1574 if (StringsAreEquivalent (eap->name, str))
1575 {
1576 return eap->value;
1577 }
1578 }
1579 for (eap = biomol_nucX_alist; eap != NULL && eap->name != NULL; eap++)
1580 {
1581 if (StringsAreEquivalent (eap->name, str))
1582 {
1583 return eap->value;
1584 }
1585 else if (eap->name [0] == 'm'
1586 && StringICmp (eap->name, "mRNA [cDNA]") == 0
1587 && StringICmp (str, "mRNA") == 0)
1588 {
1589 return eap->value;
1590 }
1591 }
1592 return 0;
1593 }
1594
1595
1596 /* This function looks at a parsed modifier structure to determine whether the
1597 * value is acceptable for this modifier type.
1598 */
1599 static Boolean ModifierHasInvalidValue (ModifierInfoPtr mip)
1600 {
1601 Boolean rval = FALSE;
1602
1603 if (mip != NULL
1604 && ((mip->modtype == eModifierType_Location
1605 && !IsValueInEnumAssoc (mip->value, biosource_genome_simple_alist))
1606 || (mip->modtype == eModifierType_Origin
1607 && !IsValueInEnumAssoc (mip->value, biosource_origin_alist))
1608 || (mip->modtype == eModifierType_Topology
1609 && !IsValueInEnumAssoc (mip->value, topology_nuc_alist))
1610 || (mip->modtype == eModifierType_Molecule
1611 && !IsValueInEnumAssoc (mip->value, molecule_alist))
1612 || ((mip->modtype == eModifierType_GeneticCode
1613 || mip->modtype == eModifierType_NucGeneticCode
1614 || mip->modtype == eModifierType_MitoGeneticCode)
1615 && GeneticCodeFromString (mip->value) == 0)
1616 || (mip->modtype == eModifierType_MolType
1617 && MolTypeFromString (mip->value) == 0)
1618 || (mip->modtype == eModifierType_SourceQual
1619 && IsNonTextModifier (mip->name)
1620 && !StringHasNoText (mip->value)
1621 && StringICmp (mip->value, "TRUE") != 0
1622 && StringICmp (mip->value, "FALSE") != 0)))
1623 {
1624 rval = TRUE;
1625 }
1626
1627 return rval;
1628 }
1629
1630 /* This section contains functions for finding, changing, and removing
1631 * bracketed value pairs in definition lines.
1632 * These functions include:
1633 *
1634 * FindValuePairInDefLine - returns pointer to position in title where
1635 * the first bracketed pair with the specified
1636 * modifier name (or one of its aliases) occurs.
1637 * Useful for non-text modifiers, which do not
1638 * have values.
1639 *
1640 * FindValueFromPairInDefline - returns value from the first bracketed
1641 * pair in the title with the specified
1642 * modifier name (or one of its aliases).
1643 *
1644 * RemoveValueFromDefline - removes the first bracketed pair in the title
1645 * with the specified modifier name (or one of its aliases)
1646 *
1647 * ReplaceValueInThisValuePair - replaces the value in the specified value pair.
1648 * if new value is empty, pair is removed.
1649 *
1650 * ReplaceValueInOneDefLine - finds the first bracketed pair in the title
1651 * with the specified modifier name (or one of its aliases).
1652 * If a pair is found, the value in that pair is replaced
1653 * with the new value; otherwise a new pair is added to
1654 * the title.
1655 *
 * ReplaceOneModifierValue - finds all bracketed pairs in a title with the specified
 *                           modifier name (or one of its aliases) and the specified value,
 *                           and replaces that value with the new value (or removes the
 *                           pair, if the new value is empty).
1660 *
1661 * RemoveAllDuplicatePairsFromOneTitle - removes all bracketed pairs that are duplicates
1662 * in name and value of another pair already in
1663 * the title.
1664 *
1665 * RemoveMeaninglessEmptyPairsFromOneTitle - removes bracketed pairs without values
1666 * that are not non-text modifiers
1667 *
1668 * StripAllInstancesOfModNameFromTitle - removes all mentions of specified modifier
1669 * name from title
1670 *
1671 */
1672
1673 static CharPtr FindValuePairInDefLine (CharPtr mod_name, CharPtr def_line, CharPtr PNTR valstop)
1674 {
1675 CharPtr cp, start, stop;
1676 ModifierInfoPtr mip;
1677 CharPtr canonical_name;
1678
1679 if (mod_name == NULL || def_line == NULL)
1680 {
1681 return NULL;
1682 }
1683
1684 cp = NextBracketToken (def_line);
1685 if (cp == NULL)
1686 {
1687 return NULL;
1688 }
1689
1690 canonical_name = GetCanonicalName (mod_name);
1691
1692 mip = ParseOneBracketedModifier (cp, &start, &stop);
1693 while (mip != NULL && start != NULL && stop != NULL
1694 && StringICmp (mip->name, canonical_name) != 0)
1695 {
1696 cp = NextBracketToken (stop + 1);
1697 mip = ModifierInfoFree (mip);
1698 mip = ParseOneBracketedModifier (cp, &start, &stop);
1699 }
1700
1701 if (mip != NULL && StringICmp (mip->name, canonical_name) == 0)
1702 {
1703 mip = ModifierInfoFree (mip);
1704 if (valstop != NULL)
1705 {
1706 *valstop = stop;
1707 }
1708 canonical_name = MemFree (canonical_name);
1709 return start;
1710 }
1711 else
1712 {
1713 mip = ModifierInfoFree (mip);
1714 canonical_name = MemFree (canonical_name);
1715 return NULL;
1716 }
1717 }
1718
1719 static CharPtr FindNthValuePairInDefLine (CharPtr title, CharPtr val_name, Int4 val_num, CharPtr PNTR p_val_end)
1720 {
1721 CharPtr val_loc, val_end = NULL;
1722 Int4 title_val_num;
1723
1724 if (StringHasNoText (val_name))
1725 {
1726 return NULL;
1727 }
1728
1729 val_loc = FindValuePairInDefLine (val_name, title, &val_end);
1730 title_val_num = 0;
1731 while (val_loc != NULL && val_end != NULL && title_val_num != val_num)
1732 {
1733 val_loc = FindValuePairInDefLine (val_name, val_end + 1, &val_end);
1734 title_val_num++;
1735 }
1736 if (p_val_end != NULL)
1737 {
1738 *p_val_end = val_end;
1739 }
1740 return val_loc;
1741 }
1742
1743 static CharPtr FindValueFromPairInDefline (CharPtr mod_name, CharPtr def_line)
1744 {
1745 CharPtr bracket_start, eq_loc, bracket_end;
1746 CharPtr new_val = NULL;
1747 Int4 new_val_len;
1748
1749 bracket_start = FindValuePairInDefLine (mod_name, def_line, &bracket_end);
1750 if (bracket_start == NULL || bracket_end == NULL)
1751 {
1752 return NULL;
1753 }
1754
1755 eq_loc = NextBracketToken (bracket_start + 1);
1756 if (eq_loc == NULL || *eq_loc != '=')
1757 {
1758 return NULL;
1759 }
1760
1761 new_val_len = bracket_end - eq_loc;
1762 new_val = (CharPtr) MemNew (new_val_len * sizeof (Char));
1763 if (new_val != NULL)
1764 {
1765 StringNCpy (new_val, eq_loc + 1, new_val_len - 1);
1766 new_val [new_val_len - 1] = 0;
1767 }
1768 TrimSpacesAroundString (new_val);
1769 return new_val;
1770 }
1771
1772 static CharPtr FindValueFromPairInDeflineBeforeCharPtr (CharPtr mod_name, CharPtr def_line, CharPtr cp)
1773 {
1774 CharPtr bracket_start, bracket_end;
1775
1776 bracket_start = FindValuePairInDefLine (mod_name, def_line, &bracket_end);
1777 if (bracket_start == NULL || (cp != NULL && bracket_start > cp))
1778 {
1779 return NULL;
1780 }
1781 else
1782 {
1783 return FindValueFromPairInDefline (mod_name, bracket_start);
1784 }
1785 }
1786
1787 static void RemoveValuePairFromDefline (CharPtr pair_start, CharPtr pair_end, CharPtr defline)
1788 {
1789 CharPtr src, dst;
1790
1791 if (pair_start == NULL || pair_end == NULL || defline == NULL
1792 || pair_end <= pair_start)
1793 {
1794 return;
1795 }
1796
1797 dst = pair_start;
1798 src = pair_end;
1799 while (isspace (*src))
1800 {
1801 src++;
1802 }
1803
1804 while (*src != 0)
1805 {
1806 *dst = *src;
1807 dst++;
1808 src++;
1809 }
1810 *dst = 0;
1811 }
1812
1813 static void RemoveValueFromDefline (CharPtr mod_name, CharPtr def_line)
1814 {
1815 CharPtr bracket_start, bracket_end;
1816
1817 bracket_start = FindValuePairInDefLine (mod_name, def_line, &bracket_end);
1818 if (bracket_start == NULL || bracket_end == NULL)
1819 {
1820 return;
1821 }
1822
1823 RemoveValuePairFromDefline (bracket_start, bracket_end + 1, def_line);
1824 }
1825
1826 static CharPtr AddQuotesToValueWithBrackets (CharPtr orig_value)
1827 {
1828 CharPtr first_bracket, first_quote;
1829 CharPtr cp, new_value = NULL, tmp_value;
1830 Char bracket_buf [2];
1831 Int4 offset;
1832
1833 if (orig_value == NULL)
1834 {
1835 return NULL;
1836 }
1837 else if (StringHasNoText (orig_value))
1838 {
1839 return StringSave (orig_value);
1840 }
1841
1842 new_value = StringSave (orig_value);
1843
1844 first_bracket = StringChr (new_value, '[');
1845 if (first_bracket == NULL)
1846 {
1847 first_bracket = StringChr (new_value, ']');
1848 }
1849
1850 first_quote = NextUnescapedQuote (new_value);
1851
1852 if (first_bracket == NULL && first_quote == NULL)
1853 {
1854 return new_value;
1855 }
1856 else if (first_bracket != NULL && first_quote == NULL)
1857 {
1858 tmp_value = (CharPtr) MemNew ((StringLen (new_value) + 3) * sizeof (Char));
1859 if (tmp_value == NULL)
1860 {
1861 new_value = MemFree (new_value);
1862 return NULL;
1863 }
1864 StringCat (tmp_value, "\"");
1865 StringCat (tmp_value, new_value);
1866 StringCat (tmp_value, "\"");
1867 new_value = MemFree (new_value);
1868 new_value = tmp_value;
1869 return new_value;
1870 }
1871
1872 cp = orig_value;
1873
1874 bracket_buf [0] = 0;
1875 bracket_buf [1] = 0;
1876
1877 while (*cp != 0)
1878 {
1879 if (*cp == '"' && (cp == orig_value || *(cp - 1) != '\\'))
1880 {
1881 cp = NextUnescapedQuote (cp + 1);
1882 if (cp == NULL)
1883 {
1884 tmp_value = (CharPtr) MemNew ((StringLen (new_value) + 3) * sizeof (Char));
1885 if (tmp_value == NULL)
1886 {
1887 new_value = MemFree (new_value);
1888 return NULL;
1889 }
1890 StringCpy (tmp_value, new_value);
1891 if (new_value [StringLen (new_value) - 1] == '\\')
1892 {
1893 StringCat (tmp_value, " ");
1894 }
1895 StringCat (tmp_value, "\"");
1896 return tmp_value;
1897 }
1898 else
1899 {
1900 cp++;
1901 }
1902 }
1903 else if (*cp == '[' || *cp == ']')
1904 {
1905 tmp_value = (CharPtr) MemNew ((StringLen (new_value) + 3) * sizeof (Char));
1906 if (tmp_value == NULL)
1907 {
1908 new_value = MemFree (new_value);
1909 return new_value;
1910 }
1911 offset = cp - new_value;
1912 StringNCpy (tmp_value, new_value, offset);
1913 StringCat (tmp_value, "\"");
1914 bracket_buf [0] = *cp;
1915 StringCat (tmp_value, bracket_buf);
1916 StringCat (tmp_value, "\"");
1917 StringCat (tmp_value, cp + 1);
1918 new_value = MemFree (new_value);
1919 new_value = tmp_value;
1920 cp = new_value + offset + 3;
1921 }
1922 else
1923 {
1924 cp++;
1925 }
1926 }
1927
1928 return new_value;
1929 }
1930
1931 static CharPtr
1932 ReplaceValueInThisValuePair
1933 (CharPtr orig_defline,
1934 CharPtr value_loc,
1935 CharPtr value_name,
1936 CharPtr end_loc,
1937 CharPtr new_value)
1938 {
1939 CharPtr new_title;
1940 Int4 new_title_len = 0;
1941 Boolean is_nontext;
1942 CharPtr tmp_name;
1943 CharPtr fixed_value;
1944
1945 if (StringHasNoText (orig_defline) || value_loc == NULL || end_loc == NULL
1946 || *value_loc != '[' || *end_loc != ']')
1947 {
1948 return orig_defline;
1949 }
1950
1951 fixed_value = AddQuotesToValueWithBrackets (new_value);
1952
1953 if (StringHasNoText (fixed_value))
1954 {
1955 RemoveValuePairFromDefline (value_loc, end_loc, orig_defline);
1956 }
1957 else
1958 {
1959 /* keep part before pair and after pair, insert new value in position */
1960 new_title_len = StringLen (orig_defline)
1961 + StringLen (value_name)
1962 + StringLen (fixed_value)
1963 + 5;
1964 new_title = MemNew (new_title_len * sizeof (Char));
1965 if (new_title != NULL)
1966 {
1967 if (value_loc > orig_defline)
1968 {
1969 StringNCpy (new_title, orig_defline, value_loc - orig_defline);
1970 }
1971 StringCat (new_title, "[");
1972 tmp_name = StringSave (value_name);
1973 tmp_name [0] = TO_LOWER (tmp_name [0]);
1974 StringCat (new_title, tmp_name);
1975 is_nontext = IsNonTextModifier (tmp_name);
1976 tmp_name = MemFree (tmp_name);
1977 StringCat (new_title, "=");
1978 if (!is_nontext)
1979 {
1980 StringCat (new_title, fixed_value);
1981 }
1982 StringCat (new_title, "]");
1983 if (end_loc != NULL && *end_loc != 0)
1984 {
1985 if (*end_loc == ']')
1986 {
1987 StringCat (new_title, end_loc + 1);
1988 }
1989 else
1990 {
1991 StringCat (new_title, end_loc);
1992 }
1993 }
1994 orig_defline = MemFree (orig_defline);
1995 orig_defline = new_title;
1996 }
1997 }
1998 TrimSpacesAroundString (orig_defline);
1999
2000 fixed_value = MemFree (fixed_value);
2001
2002 return orig_defline;
2003 }
2004
2005 static CharPtr InsertStringAtOffset (CharPtr old_string, CharPtr new_string, Int4 offset)
2006 {
2007 Int4 new_len;
2008 CharPtr new_str = NULL;
2009
2010 if (old_string == NULL)
2011 {
2012 new_str = StringSave (new_string);
2013 }
2014 else if (new_string == NULL)
2015 {
2016 new_str = StringSave (old_string);
2017 }
2018 else
2019 {
2020 new_len = StringLen (old_string) + StringLen (new_string) + 1;
2021 new_str = (CharPtr) MemNew (new_len * sizeof (Char));
2022 if (new_str != NULL)
2023 {
2024 StringNCpy (new_str, old_string, offset);
2025 StringCat (new_str, new_string);
2026 if ((Uint4)offset < StringLen (old_string))
2027 {
2028 StringCat (new_str, old_string + offset);
2029 }
2030 }
2031 }
2032 return new_str;
2033 }
2034
2035 static CharPtr
2036 InsertValuePairAtOffset
2037 (CharPtr orig_defline,
2038 CharPtr value_name,
2039 CharPtr value_str,
2040 Int4 offset)
2041 {
2042 CharPtr pair_string, fixed_value;
2043
2044 if (StringHasNoText (value_name) || offset < 0)
2045 {
2046 return orig_defline;
2047 }
2048
2049 fixed_value = AddQuotesToValueWithBrackets (value_str);
2050
2051 pair_string = (CharPtr) MemNew ((StringLen (value_name) + StringLen (fixed_value) + 6) * sizeof (Char));
2052 if (pair_string != NULL)
2053 {
2054 if (IsNonTextModifier (value_name))
2055 {
2056 sprintf (pair_string, "[%s=]", value_name);
2057 }
2058 else
2059 {
2060 sprintf (pair_string, "[%s=%s]", value_name, fixed_value);
2061 }
2062 orig_defline = InsertStringAtOffset (orig_defline, pair_string, offset);
2063 pair_string = MemFree (pair_string);
2064 }
2065 fixed_value = MemFree (fixed_value);
2066 return orig_defline;
2067 }
2068
2069
2070 static CharPtr
2071 ReplaceValueInOneDefLineForOrganism
2072 (CharPtr orig_defline,
2073 CharPtr value_name,
2074 CharPtr new_value,
2075 CharPtr organism)
2076 {
2077 CharPtr value_loc = NULL, end_loc = NULL;
2078 CharPtr fixed_value;
2079 CharPtr next_org_loc = NULL, org_stop = NULL, first_org_stop = NULL;
2080 CharPtr first_organism;
2081
2082 if (StringHasNoText (value_name))
2083 {
2084 return orig_defline;
2085 }
2086
2087 /* if we want to add a value to a specific organism, we need to make sure
2088 * that we insert or replace a value after that organism name but before
2089 * the next organism name.
2090 */
2091
2092 if (organism != NULL)
2093 {
2094 if (organism < orig_defline || organism - orig_defline > (Int4) StringLen (orig_defline))
2095 {
2096 organism = NULL;
2097 }
2098 }
2099
2100 if (organism != NULL)
2101 {
2102 if (organism != FindValuePairInDefLine ("organism", organism, &org_stop))
2103 {
2104 return orig_defline;
2105 }
2106 }
2107
2108 first_organism = FindValuePairInDefLine ("organism", orig_defline, &first_org_stop);
2109
2110
2111 if (organism == NULL)
2112 {
2113 organism = first_organism;
2114 org_stop = first_org_stop;
2115 }
2116
2117 if (org_stop != NULL)
2118 {
2119 next_org_loc = FindValuePairInDefLine ("organism", org_stop + 1, NULL);
2120 }
2121
2122 fixed_value = AddQuotesToValueWithBrackets (new_value);
2123
2124 /* if this is the first organism, or if we have no organism, start looking for
2125 * a value to replace at the beginning of the line.
2126 */
2127 if (organism == NULL || organism == first_organism)
2128 {
2129 value_loc = FindValuePairInDefLine (value_name, orig_defline, &end_loc);
2130 }
2131 else
2132 {
2133 value_loc = FindValuePairInDefLine (value_name, organism, &end_loc);
2134 }
2135
2136 if (next_org_loc != NULL && value_loc > next_org_loc)
2137 {
2138 value_loc = NULL;
2139 }
2140
2141 if (StringHasNoText (fixed_value))
2142 {
2143 if (value_loc == NULL)
2144 {
2145 /* old line had no value, no new value provided, no change */
2146 }
2147 else
2148 {
2149 RemoveValuePairFromDefline (value_loc, end_loc, orig_defline);
2150 }
2151 }
2152 else
2153 {
2154 if (value_loc == NULL)
2155 {
2156 /* add new value just before next organism */
2157 if (next_org_loc == NULL)
2158 {
2159 orig_defline = InsertValuePairAtOffset (orig_defline, value_name, new_value,
2160 StringLen (orig_defline));
2161 }
2162 else
2163 {
2164 orig_defline = InsertValuePairAtOffset (orig_defline, value_name, new_value,
2165 next_org_loc - orig_defline);
2166 }
2167 }
2168 else
2169 {
2170 /* replace this value */
2171 orig_defline = ReplaceValueInThisValuePair (orig_defline, value_loc, value_name,
2172 end_loc, new_value);
2173 }
2174 }
2175 TrimSpacesAroundString (orig_defline);
2176
2177 fixed_value = MemFree (fixed_value);
2178
2179 return orig_defline;
2180 }
2181
2182 static CharPtr
2183 ReplaceValueInOneDefLine
2184 (CharPtr orig_defline,
2185 CharPtr value_name,
2186 CharPtr new_value)
2187 {
2188 CharPtr value_loc = NULL, end_loc = NULL;
2189
2190 if (StringHasNoText (value_name))
2191 {
2192 return orig_defline;
2193 }
2194
2195 value_loc = FindValuePairInDefLine (value_name, orig_defline, &end_loc);
2196
2197 if (value_loc == NULL)
2198 {
2199 if (StringHasNoText (new_value))
2200 {
2201 /* old line had no value, no new value provided, no change */
2202 return orig_defline;
2203 }
2204 else
2205 {
2206 /* make sure value is added for first organism */
2207 orig_defline = ReplaceValueInOneDefLineForOrganism (orig_defline, value_name,
2208 new_value, NULL);
2209 }
2210 }
2211 else
2212 {
2213 orig_defline = ReplaceValueInThisValuePair (orig_defline, value_loc, value_name, end_loc, new_value);
2214 }
2215
2216 return orig_defline;
2217 }
2218
2219 static CharPtr
2220 ReplaceOneModifierValue
2221 (CharPtr title,
2222 CharPtr orig_name,
2223 CharPtr orig_value,
2224 CharPtr repl_value,
2225 Boolean is_nontext,
2226 Boolean copy_to_note)
2227 {
2228 CharPtr bracket_loc, eq_loc, end_bracket_loc, new_title;
2229 Int4 new_title_len;
2230 CharPtr orig_note, new_note;
2231 Boolean any_replaced = FALSE;
2232
2233 if (StringHasNoText (title)
2234 || StringHasNoText (orig_name))
2235 {
2236 return title;
2237 }
2238
2239 bracket_loc = FindValuePairInDefLine (orig_name, title, &end_bracket_loc);
2240 while (bracket_loc != NULL && end_bracket_loc != NULL)
2241 {
2242 eq_loc = NextBracketToken (bracket_loc + 1);
2243 if (eq_loc == NULL || *eq_loc != '=')
2244 {
2245 return title;
2246 }
2247 if ((StringNCmp (orig_value, eq_loc + 1, StringLen (orig_value)) == 0
2248 && StringLen (orig_value) == end_bracket_loc - eq_loc - 1)
2249 || (StringHasNoText (orig_value)
2250 && StringSpn (eq_loc + 1, " \t") == end_bracket_loc - eq_loc - 1))
2251 {
2252 new_title_len = StringLen (title) + StringLen (repl_value) - StringLen (orig_value) + 1;
2253 new_title = (CharPtr) MemNew (new_title_len * sizeof (Char));
2254 if (new_title == NULL)
2255 {
2256 return title;
2257 }
2258 if (is_nontext)
2259 {
2260 if (StringHasNoText (repl_value))
2261 {
2262 StringNCpy (new_title, title, bracket_loc - title);
2263 StringCat (new_title, end_bracket_loc + 1 + StringSpn (end_bracket_loc, " "));
2264 }
2265 else
2266 {
2267 StringNCpy (new_title, title, eq_loc - title + 1);
2268 StringCat (new_title, end_bracket_loc);
2269 }
2270 }
2271 else if (StringHasNoText (repl_value))
2272 {
2273 /* remove pair completely */
2274 StringNCpy (new_title, title, bracket_loc - title);
2275 StringCat (new_title, end_bracket_loc + 1);
2276 }
2277 else
2278 {
2279 StringNCpy (new_title, title, eq_loc - title + 1);
2280 StringCat (new_title, repl_value);
2281 StringCat (new_title, end_bracket_loc);
2282 }
2283
2284 title = MemFree (title);
2285 title = new_title;
2286 any_replaced = TRUE;
2287 bracket_loc = FindValuePairInDefLine (orig_name, title, &end_bracket_loc);
2288 }
2289 else
2290 {
2291 bracket_loc = FindValuePairInDefLine (orig_name, end_bracket_loc, &end_bracket_loc);
2292 }
2293 }
2294
2295 if (any_replaced && copy_to_note && !StringHasNoText (repl_value) && !StringHasNoText (orig_value))
2296 {
2297 orig_note = FindValueFromPairInDefline ("note", title);
2298 if (StringHasNoText (orig_note))
2299 {
2300 new_note = (CharPtr) MemNew ((StringLen (orig_name)
2301 + StringLen (orig_value) + 8) * sizeof (Char));
2302 if (new_note != NULL)
2303 {
2304 sprintf (new_note, "%s was %s", orig_name, orig_value);
2305 }
2306 }
2307 else
2308 {
2309 new_note = (CharPtr) MemNew ((StringLen (orig_note)
2310 + StringLen (orig_name)
2311 + StringLen (orig_value) + 8) * sizeof (Char));
2312 if (new_note != NULL)
2313 {
2314 sprintf (new_note, "%s; %s was %s", orig_note, orig_name, orig_value);
2315 }
2316 }
2317
2318 if (new_note != NULL)
2319 {
2320 title = ReplaceValueInOneDefLine (title, "note", new_note);
2321 }
2322
2323 orig_note = MemFree (orig_note);
2324 new_note = MemFree (new_note);
2325 }
2326
2327 return title;
2328 }
2329
2330 static Boolean IsUnrecognizedModifierName (ModifierInfoPtr mip, Boolean is_nuc);
2331
2332 static void RemoveRecognizedModifiersFromTitle (CharPtr title, ValNodePtr modifier_info_list, Boolean is_nuc)
2333 {
2334 ValNodePtr vnp;
2335 ModifierInfoPtr mip;
2336
2337 for (vnp = modifier_info_list; vnp != NULL; vnp = vnp->next) {
2338 mip = (ModifierInfoPtr) vnp->data.ptrvalue;
2339 if (mip != NULL && ! IsUnrecognizedModifierName (mip, is_nuc)
2340 && (!is_nuc || mip->modtype != eModifierType_Protein)) {
2341 RemoveValueFromDefline (mip->name, title);
2342 }
2343 }
2344 }
2345
2346 static void StripAllInstancesOfModNameFromTitle (CharPtr mod_name, CharPtr title)
2347 {
2348 CharPtr valstr;
2349
2350 valstr = FindValueFromPairInDefline (mod_name, title);
2351 while (valstr != NULL)
2352 {
2353 RemoveValueFromDefline (mod_name, title);
2354 valstr = MemFree (valstr);
2355 valstr = FindValueFromPairInDefline (mod_name, title);
2356 }
2357 }
2358
2359 static CharPtr RemoveAllDuplicatePairsFromOneTitle (CharPtr title)
2360 {
2361 CharPtr start_bracket, end_bracket, tmp_title, new_title;
2362 ModifierInfoPtr mip;
2363 Int4 offset;
2364
2365 mip = ParseOneBracketedModifier (title, &start_bracket, &end_bracket);
2366 while (mip != NULL && start_bracket != NULL && end_bracket != NULL)
2367 {
2368 offset = end_bracket - title + 1;
2369 tmp_title = StringSave (title + offset);
2370 tmp_title = ReplaceOneModifierValue (tmp_title, mip->name, mip->value, NULL,
2371 IsNonTextModifier (mip->name), FALSE);
2372 new_title = (CharPtr) MemNew ((StringLen (tmp_title) + offset + 1)* sizeof (Char));
2373 if (new_title != NULL)
2374 {
2375 StringNCpy (new_title, title, offset);
2376 StringCat (new_title, tmp_title);
2377 }
2378 tmp_title = MemFree (tmp_title);
2379 title = MemFree (title);
2380 title = new_title;
2381 mip = ModifierInfoFree (mip);
2382 mip = ParseOneBracketedModifier (title + offset, &start_bracket, &end_bracket);
2383 }
2384 mip = ModifierInfoFree (mip);
2385 return title;
2386 }
2387
2388 static void ShiftString (CharPtr str, Int4 shift_size)
2389 {
2390 CharPtr src, dst;
2391
2392 if (str == NULL)
2393 {
2394 return;
2395 }
2396
2397 if (shift_size > (Int4) StringLen (str))
2398 {
2399 *str = 0;
2400 }
2401 else
2402 {
2403 src = str + shift_size;
2404 dst = str;
2405 while (*src != 0)
2406 {
2407 *dst = *src;
2408 dst++;
2409 src++;
2410 }
2411 *dst = 0;
2412 }
2413 }
2414
2415 static void RemoveMeaninglessEmptyPairsFromOneTitle (CharPtr title)
2416 {
2417 CharPtr start_bracket, end_bracket;
2418 ModifierInfoPtr mip;
2419
2420 mip = ParseOneBracketedModifier (title, &start_bracket, &end_bracket);
2421 while (mip != NULL && start_bracket != NULL && end_bracket != NULL)
2422 {
2423 if (StringHasNoText (mip->value) && ! IsNonTextModifier (mip->name))
2424 {
2425 ShiftString (start_bracket, end_bracket - start_bracket + 1);
2426 mip = ModifierInfoFree (mip);
2427 mip = ParseOneBracketedModifier (start_bracket, &start_bracket, &end_bracket);
2428 }
2429 else
2430 {
2431 mip = ModifierInfoFree (mip);
2432 mip = ParseOneBracketedModifier (end_bracket + 1, &start_bracket, &end_bracket);
2433 }
2434 }
2435 mip = ModifierInfoFree (mip);
2436 }
2437
2438 static void ApplyOneModToSeqEntry (SeqEntryPtr sep, CharPtr mod_name, CharPtr mod_value)
2439 {
2440 BioseqPtr bsp = NULL;
2441 SeqDescrPtr sdp = NULL;
2442
2443 if (sep == NULL || StringHasNoText (mod_name))
2444 {
2445 return;
2446 }
2447
2448 if (IS_Bioseq (sep))
2449 {
2450 bsp = (BioseqPtr) sep->data.ptrvalue;
2451 }
2452 else if (IS_Bioseq_set (sep))
2453 {
2454 sep = FindNucSeqEntry (sep);
2455 if (sep != NULL && IS_Bioseq (sep))
2456 {
2457 bsp = (BioseqPtr) sep->data.ptrvalue;
2458 }
2459 }
2460
2461 if (bsp == NULL)
2462 {
2463 return;
2464 }
2465
2466 for (sdp = bsp->descr; sdp != NULL && sdp->choice != Seq_descr_title; sdp = sdp->next)
2467 {
2468 }
2469
2470 if (sdp == NULL)
2471 {
2472 sdp = SeqDescrNew (NULL);
2473 sdp->choice = Seq_descr_title;
2474 if (bsp->descr == NULL)
2475 {
2476 bsp->descr = sdp;
2477 }
2478 }
2479 if (sdp != NULL)
2480 {
2481 sdp->data.ptrvalue = ReplaceValueInOneDefLine (sdp->data.ptrvalue,
2482 mod_name, mod_value);
2483 }
2484
2485
2486 }
2487
2488 static ModifierInfoPtr MakeModifierInfoFromNameAndValue (CharPtr value_name, CharPtr value_string)
2489 {
2490 ModifierInfoPtr mip;
2491 CharPtr tmp_pair;
2492 CharPtr fixed_value;
2493
2494 fixed_value = AddQuotesToValueWithBrackets (value_string);
2495 tmp_pair = (CharPtr) MemNew ((StringLen (value_name) + StringLen (fixed_value) + 4));
2496 if (tmp_pair == NULL)
2497 {
2498 return NULL;
2499 }
2500 sprintf (tmp_pair, "[%s=%s]", value_name == NULL ? "" : value_name,
2501 fixed_value == NULL ? "" : fixed_value);
2502 mip = ParseOneBracketedModifier (tmp_pair, NULL, NULL);
2503 tmp_pair = MemFree (tmp_pair);
2504 fixed_value = MemFree (fixed_value);
2505 return mip;
2506 }
2507
2508 /* This section is used to import tables of modifiers. */
2509 static CharPtr
2510 ApplyImportModToTitle
2511 (CharPtr title,
2512 CharPtr value_name,
2513 CharPtr value_string,
2514 Boolean erase_where_blank,
2515 Boolean parse_multiple)
2516 {
2517 ModifierInfoPtr mip;
2518 CharPtr next_semi, val_start, title_loc, title_end;
2519 CharPtr insert_point;
2520 Int4 insert_offset, title_val_num;
2521 Char val_save_ch;
2522
2523 if (StringHasNoText (value_name))
2524 {
2525 return title;
2526 }
2527
2528 if (!erase_where_blank && StringHasNoText (value_string))
2529 {
2530 return title;
2531 }
2532
2533 mip = MakeModifierInfoFromNameAndValue (value_name, value_string);
2534
2535 if (mip == NULL
2536 || (mip->modtype == eModifierType_SourceQual
2537 && mip->subtype == 255
2538 && StringICmp (mip->name, "note-subsrc") != 0
2539 && StringICmp (mip->name, "note-orgmod") != 0))
2540 {
2541 mip = ModifierInfoFree (mip);
2542 return title;
2543 }
2544
2545 if (erase_where_blank && StringHasNoText (value_string))
2546 {
2547 RemoveValueFromDefline (value_name, title);
2548 }
2549 else if (parse_multiple
2550 && value_string [0] == '('
2551 && value_string [StringLen (value_string) - 1] == ')'
2552 && (next_semi = StringChr (value_string, ';')) != NULL)
2553 {
2554 val_start = value_string + 1;
2555 title_val_num = 0;
2556 while (next_semi != NULL)
2557 {
2558 /* temporarily truncate at end of value */
2559 val_save_ch = *next_semi;
2560 *next_semi = 0;
2561
2562 title_loc = FindNthValuePairInDefLine (title, value_name, title_val_num, &title_end);
2563 if (StringHasNoText (val_start))
2564 {
2565 if (title_loc != NULL)
2566 {
2567 RemoveValuePairFromDefline (title_loc, title_end, title);
2568 }
2569 else
2570 {
2571 /* if text is empty and there is no value pair, nothing to do */
2572 }
2573 /* note - we do not increment title_val_num here because either we've
2574 * removed a value or there are no values left.
2575 */
2576 }
2577 else
2578 {
2579 if (title_loc == NULL)
2580 {
2581 /* need to insert a new value - if organism name, put at end of title,
2582 * otherwise insert before second organism name if any
2583 */
2584 if (StringICmp (value_name, "organism") == 0)
2585 {
2586 insert_offset = StringLen (title);
2587 }
2588 else
2589 {
2590 insert_point = FindNthValuePairInDefLine (title, "organism", 1, NULL);
2591 if (insert_point == NULL)
2592 {
2593 insert_offset = StringLen (title);
2594 }
2595 else
2596 {
2597 insert_offset = insert_point - title;
2598 }
2599 }
2600 title = InsertValuePairAtOffset (title, value_name, val_start, insert_offset);
2601 }
2602 else
2603 {
2604 /* replace values in order */
2605 title = ReplaceValueInThisValuePair (title, title_loc, value_name,
2606 title_end, val_start);
2607 }
2608
2609 title_val_num++;
2610 }
2611
2612 /* replace character */
2613 *next_semi = val_save_ch;
2614 /* advance to next value in list */
2615 val_start = next_semi + 1;
2616 if (*next_semi == ';')
2617 {
2618 next_semi = StringChr (next_semi + 1, ';');
2619 if (next_semi == NULL)
2620 {
2621 next_semi = value_string + StringLen (value_string) - 1;
2622 }
2623 }
2624 else
2625 {
2626 next_semi = NULL;
2627 }
2628 }
2629 }
2630 else if (StringCmp (value_name, "organism") == 0)
2631 {
2632 title = ReplaceValueInOneDefLine (title, value_name, value_string);
2633 }
2634 else
2635 {
2636 title = ReplaceValueInOneDefLineForOrganism (title, value_name, value_string, NULL);
2637 }
2638
2639 mip = ModifierInfoFree (mip);
2640 return title;
2641 }
2642
2643 static ValNodePtr ReadRowListFromFile (void)
2644 {
2645 Char path [PATH_MAX];
2646 ValNodePtr header_line = NULL;
2647 FILE *fp;
2648
2649 path [0] = '\0';
2650 if (! GetInputFileName (path, sizeof (path), NULL, "TEXT")) return NULL;
2651 fp = FileOpen (path, "r");
2652 if (fp == NULL) {
2653 Message (MSG_ERROR, "Unable to open %s", path);
2654 } else {
2655 header_line = ReadTabTableFromFile (fp);
2656 FileClose (fp);
2657 }
2658 return header_line;
2659 }
2660
2661 /* This function will find the sequence number in the IDAndTitleEdit
2662 * to use for each row and put that value in the sequence_numbers array.
2663 */
2664 static Boolean
2665 ValidateModifierTableSequenceIDs
2666 (ValNodePtr header_line,
2667 IDAndTitleEditPtr iatep,
2668 Int4Ptr sequence_numbers,
2669 Int4Ptr num_rows)
2670 {
2671 ValNodePtr not_found = NULL;
2672 ValNodePtr found_more_than_once = NULL;
2673 CharPtr too_many_msg = NULL, not_found_msg = NULL;
2674 Boolean rval = TRUE;
2675 Int4 msg_len = 0;
2676 CharPtr too_many_fmt = " found more than once\n";
2677 CharPtr not_found_fmt = " not found\n";
2678 CharPtr err_msg = NULL;
2679 ValNodePtr row_vnp, col_vnp, prev_row, next_row;
2680 Int4 i, seq_num, other_instances;
2681 Boolean found;
2682 Int4 row_number;
2683 Int4 deleted_rows;
2684
2685 if (header_line == NULL || header_line->next == NULL || iatep == NULL
2686 || sequence_numbers == NULL || num_rows == NULL || *num_rows < ValNodeLen (header_line->next))
2687 {
2688 return FALSE;
2689 }
2690
2691 for (row_vnp = header_line->next, row_number = 0;
2692 row_vnp != NULL && row_number < *num_rows;
2693 row_vnp = row_vnp->next, row_number++)
2694 {
2695 col_vnp = row_vnp->data.ptrvalue;
2696 if (col_vnp == NULL || col_vnp->data.ptrvalue == NULL)
2697 {
2698 continue;
2699 }
2700
2701 /* find correct sequence number */
2702 seq_num = -1;
2703 for (i = 0, found = FALSE; i < iatep->num_sequences && !found; i++)
2704 {
2705 if (StringCmp (iatep->id_list [i], col_vnp->data.ptrvalue) == 0)
2706 {
2707 seq_num = i;
2708 found = TRUE;
2709 }
2710 }
2711 sequence_numbers[row_number] = seq_num;
2712
2713 if (!found)
2714 {
2715 ValNodeAddPointer (¬_found, 0, StringSave (col_vnp->data.ptrvalue));
2716 }
2717 else
2718 {
2719 /* count the number of times this seq_num has already appeared in the list.*/
2720 other_instances = 0;
2721 for (prev_row = header_line->next; prev_row != row_vnp; prev_row = prev_row->next)
2722 {
2723 if (prev_row->choice == seq_num)
2724 {
2725 other_instances++;
2726 }
2727 }
2728 /* if the value was found exactly once, add this to the list of duplicates.
2729 * if the value was found more than once, it will already have been reported.
2730 */
2731 if (other_instances == 1)
2732 {
2733 ValNodeAddPointer (&found_more_than_once, 0, StringSave (col_vnp->data.ptrvalue));
2734 }
2735 }
2736 }
2737
2738 if (found_more_than_once != NULL || not_found != NULL)
2739 {
2740 if (found_more_than_once != NULL)
2741 {
2742 too_many_msg = CreateListMessage ("Sequence ID", NULL, found_more_than_once);
2743 rval = FALSE;
2744 msg_len += StringLen (too_many_msg) + StringLen (too_many_fmt) + 5;
2745 }
2746 if (not_found != NULL)
2747 {
2748 not_found_msg = CreateListMessage ("Sequence ID", NULL, not_found);
2749 msg_len += StringLen (not_found_msg) + StringLen (not_found_fmt) + 5;
2750 }
2751
2752 err_msg = (CharPtr) MemNew ((msg_len + 1) * sizeof (Char));
2753 if (err_msg != NULL)
2754 {
2755 if (too_many_msg != NULL)
2756 {
2757 StringCat (err_msg, too_many_msg);
2758 if (found_more_than_once->next != NULL)
2759 {
2760 StringCat (err_msg, " were");
2761 }
2762 else
2763 {
2764 StringCat (err_msg, " was");
2765 }
2766 StringCat (err_msg, too_many_fmt);
2767 }
2768 if (not_found_msg != NULL)
2769 {
2770 StringCat (err_msg, not_found_msg);
2771 if (not_found->next != NULL)
2772 {
2773 StringCat (err_msg, " were");
2774 }
2775 else
2776 {
2777 StringCat (err_msg, " was");
2778 }
2779 StringCat (err_msg, not_found_fmt);
2780 }
2781 if (rval)
2782 {
2783 if (ANS_NO == Message (MSG_YN, "%sContinue anyway?", err_msg))
2784 {
2785 rval = FALSE;
2786 }
2787 }
2788 else
2789 {
2790 Message (MSG_ERROR, "%sPlease correct your file.", err_msg);
2791 }
2792 }
2793 too_many_msg = MemFree (too_many_msg);
2794 not_found_msg = MemFree (not_found_msg);
2795 err_msg = MemFree (err_msg);
2796 }
2797
2798 /* remove rows for sequence IDs that are not found */
2799 for (row_vnp = header_line->next, row_number = 0, prev_row = header_line;
2800 row_vnp != NULL && row_number < *num_rows;
2801 row_vnp = next_row, row_number++)
2802 {
2803 next_row = row_vnp->next;
2804 if (sequence_numbers[row_number] < 0) {
2805 prev_row->next = next_row;
2806 row_vnp->next = NULL;
2807 row_vnp = FreeTableDisplayRowList (row_vnp);
2808 } else {
2809 prev_row = row_vnp;
2810 }
2811 }
2812
2813 /* now remove sequence_numbers entries */
2814 deleted_rows = 0;
2815 row_number = 0;
2816 while (row_number < *num_rows)
2817 {
2818 if (sequence_numbers[row_number] < 0) {
2819 for (i = row_number + 1; i < *num_rows; i++) {
2820 sequence_numbers[i - 1] = sequence_numbers[i];
2821 }
2822 (*num_rows)--;
2823 } else {
2824 row_number++;
2825 }
2826 }
2827
2828 return rval;
2829 }
2830
2831 /* This checks the column names and puts the modifier type in the choice for each column */
2832 static Boolean ValidateImportModifierColumnNames (ValNodePtr header_line)
2833 {
2834 ValNodePtr header_vnp;
2835 Boolean rval = TRUE;
2836 ModifierInfoPtr mip;
2837 CharPtr orig_name;
2838 Int4 col_num;
2839
2840 if (header_line == NULL)
2841 {
2842 return FALSE;
2843 }
2844
2845 header_vnp = header_line->data.ptrvalue;
2846 if (header_vnp == NULL || header_vnp->next == NULL)
2847 {
2848 return FALSE;
2849 }
2850
2851 /* check ID column */
2852 if (StringICmp (header_vnp->data.ptrvalue, "local_id") != 0
2853 && StringICmp (header_vnp->data.ptrvalue, "local id") != 0
2854 && StringICmp (header_vnp->data.ptrvalue, "local-id") != 0
2855 && StringICmp (header_vnp->data.ptrvalue, "seq_id") != 0
2856 && StringICmp (header_vnp->data.ptrvalue, "seq id") != 0
2857 && StringICmp (header_vnp->data.ptrvalue, "seq-id") != 0
2858 && StringICmp (header_vnp->data.ptrvalue, "seqid") != 0
2859 && StringICmp (header_vnp->data.ptrvalue, "sequence_id") != 0
2860 && StringICmp (header_vnp->data.ptrvalue, "sequence id") != 0
2861 && StringICmp (header_vnp->data.ptrvalue, "sequence-id") != 0
2862 )
2863 {
2864 Message (MSG_ERROR, "Table file is missing header line! Make sure first column header is seq_id");
2865 return FALSE;
2866 }
2867 header_vnp = header_vnp->next;
2868 col_num = 1;
2869 while (header_vnp != NULL && rval)
2870 {
2871 mip = MakeModifierInfoFromNameAndValue (header_vnp->data.ptrvalue, NULL);
2872 if (mip == NULL
2873 || (mip->modtype == eModifierType_SourceQual
2874 && mip->subtype == 255
2875 && StringICmp (mip->name, "note-subsrc") != 0
2876 && StringICmp (mip->name, "note-orgmod") != 0))
2877 {
2878 orig_name = (CharPtr) header_vnp->data.ptrvalue;
2879 rval = ReplaceImportModifierName (&orig_name, col_num);
2880 header_vnp->data.ptrvalue = orig_name;
2881 }
2882 else
2883 {
2884 header_vnp->data.ptrvalue = MemFree (header_vnp->data.ptrvalue);
2885 header_vnp->data.ptrvalue = StringSave (mip->name);
2886 header_vnp->choice = mip->modtype;
2887 }
2888 mip = ModifierInfoFree (mip);
2889 header_vnp = header_vnp->next;
2890 col_num++;
2891 }
2892 return rval;
2893 }
2894
2895 static Boolean StringAlreadyInList (ValNodePtr list, CharPtr str)
2896 {
2897 while (list != NULL)
2898 {
2899 if (StringICmp (list->data.ptrvalue, str) == 0)
2900 {
2901 return TRUE;
2902 }
2903 list = list->next;
2904 }
2905 return FALSE;
2906 }
2907
2908 static Boolean ValidateTableValues (ValNodePtr header_line)
2909 {
2910 ValNodePtr header_vnp, row_vnp, col_vnp;
2911 Boolean rval = TRUE;
2912 ModifierInfoPtr mip;
2913 Int4 col_num;
2914 ValNodePtr bad_value_columns = NULL;
2915 ValNodePtr bad_nontext_columns = NULL;
2916 CharPtr err_msg;
2917
2918 if (header_line == NULL || header_line->next == NULL
2919 || header_line->data.ptrvalue == NULL)
2920 {
2921 return FALSE;
2922 }
2923
2924 for (row_vnp = header_line->next; row_vnp != NULL; row_vnp = row_vnp->next)
2925 {
2926 /* skip rows with bad sequence IDs */
2927 if (row_vnp->data.ptrvalue == NULL)
2928 {
2929 continue;
2930 }
2931
2932 header_vnp = header_line->data.ptrvalue;
2933 col_vnp = row_vnp->data.ptrvalue;
2934 /* skip ID column */
2935 header_vnp = header_vnp->next;
2936 col_vnp = col_vnp->next;
2937 for (col_num = 1;
2938 header_vnp != NULL && col_vnp != NULL;
2939 header_vnp = header_vnp->next, col_vnp = col_vnp->next, col_num++)
2940 {
2941 mip = MakeModifierInfoFromNameAndValue (header_vnp->data.ptrvalue,
2942 col_vnp->data.ptrvalue);
2943 if (mip->modtype == eModifierType_SourceQual
2944 && IsNonTextModifier (mip->name))
2945 {
2946 if (StringICmp (mip->value, "TRUE") != 0
2947 && StringICmp (mip->value, "FALSE") != 0)
2948 {
2949 if (!StringAlreadyInList (bad_nontext_columns, header_vnp->data.ptrvalue))
2950 {
2951 ValNodeAddPointer (&bad_nontext_columns, col_num, StringSave (header_vnp->data.ptrvalue));
2952 }
2953 }
2954 }
2955 else if (ModifierHasInvalidValue (mip))
2956 {
2957 if (!StringAlreadyInList (bad_value_columns, header_vnp->data.ptrvalue))
2958 {
2959 ValNodeAddPointer (&bad_value_columns, col_num, StringSave (header_vnp->data.ptrvalue));
2960 }
2961 }
2962 mip = ModifierInfoFree (mip);
2963 }
2964 }
2965
2966 if (bad_value_columns != NULL)
2967 {
2968 err_msg = CreateListMessage ("Your file contains invalid values for column",
2969 ". Please edit your file to list valid values.",
2970 bad_value_columns);
2971 Message (MSG_ERROR, err_msg);
2972 rval = FALSE;
2973 }
2974 if (bad_nontext_columns != NULL && rval)
2975 {
2976 err_msg = CreateListMessage ("Your file contains values other than TRUE or FALSE for column",
2977 ". These modifiers do not allow other text. Click OK to "
2978 "discard this text and mark the values as TRUE. If you "
2979 "wish to preserve this text under another modifier, click "
2980 "Cancel and change the column header in your file.",
2981 bad_nontext_columns);
2982 if (ANS_CANCEL == Message (MSG_OKC, err_msg))
2983 {
2984 rval = FALSE;
2985 }
2986 }
2987
2988 bad_value_columns = ValNodeFreeData (bad_value_columns);
2989 bad_nontext_columns = ValNodeFreeData (bad_nontext_columns);
2990 return rval;
2991 }
2992
2993 static Boolean
2994 CheckModifiersForOverwrite
2995 (ValNodePtr header_line,
2996 IDAndTitleEditPtr iatep,
2997 Int4Ptr sequence_numbers,
2998 Int4 num_rows,
2999 BoolPtr erase_where_blank,
3000 BoolPtr parse_multiple)
3001 {
3002 ValNodePtr row_vnp, header_vnp, col_vnp;
3003 CharPtr title_val, data_val;
3004 ValNodePtr blank_column_list = NULL;
3005 ValNodePtr replace_column_list = NULL;
3006 ValNodePtr parse_multi_list = NULL;
3007 Int4 col_num, row_num;
3008 Boolean rval = TRUE;
3009 CharPtr err_msg;
3010 MsgAnswer ans;
3011
3012 if (header_line == NULL || header_line->next == NULL || iatep == NULL
3013 || sequence_numbers == NULL || num_rows < ValNodeLen (header_line->next)
3014 || erase_where_blank == NULL || parse_multiple == NULL)
3015 {
3016 return FALSE;
3017 }
3018
3019 *erase_where_blank = FALSE;
3020 *parse_multiple = FALSE;
3021
3022 for (row_vnp = header_line->next, row_num = 0;
3023 row_vnp != NULL && row_num < num_rows;
3024 row_vnp = row_vnp->next, row_num++)
3025 {
3026 if (row_vnp->data.ptrvalue == NULL)
3027 {
3028 continue;
3029 }
3030 header_vnp = header_line->data.ptrvalue;
3031 col_vnp = row_vnp->data.ptrvalue;
3032
3033 /* skip ID column */
3034 header_vnp = header_vnp->next;
3035 col_vnp = col_vnp->next;
3036
3037 col_num = 1;
3038 while (header_vnp != NULL && col_vnp != NULL)
3039 {
3040 /* if column name is blank, skip */
3041 if (header_vnp->data.ptrvalue != NULL)
3042 {
3043 title_val = FindValueFromPairInDefline (header_vnp->data.ptrvalue,
3044 iatep->title_list [sequence_numbers[row_num]]);
3045 data_val = col_vnp->data.ptrvalue;
3046 if (!StringHasNoText (title_val))
3047 {
3048 if (StringHasNoText (data_val))
3049 {
3050 /* add to list of possible erasures */
3051 if (!StringAlreadyInList (blank_column_list, header_vnp->data.ptrvalue))
3052 {
3053 ValNodeAddPointer (&blank_column_list, col_num, StringSave (header_vnp->data.ptrvalue));
3054 }
3055 }
3056 else if (StringCmp (data_val, title_val) != 0)
3057 {
3058 /* add to list of possible replacements */
3059 if (!StringAlreadyInList (replace_column_list, header_vnp->data.ptrvalue))
3060 {
3061 ValNodeAddPointer (&replace_column_list, col_num, StringSave (header_vnp->data.ptrvalue));
3062 }
3063 }
3064 }
3065 title_val = MemFree (title_val);
3066 /* check for multival parsing */
3067 if (data_val != NULL
3068 && data_val [0] == '(' && data_val [StringLen (data_val) - 1] == ')'
3069 && StringChr (data_val, ';') != NULL
3070 && !StringAlreadyInList (parse_multi_list, header_vnp->data.ptrvalue))
3071 {
3072 ValNodeAddPointer (&parse_multi_list, col_num, StringSave (header_vnp->data.ptrvalue));
3073 }
3074 }
3075 header_vnp = header_vnp->next;
3076 col_vnp = col_vnp->next;
3077 col_num++;
3078 }
3079 }
3080
3081 if (replace_column_list != NULL)
3082 {
3083 err_msg = CreateListMessage ("Record already contains values for column",
3084 " also found in the import table.\n"
3085 "Do you wish to overwrite these values?",
3086 replace_column_list);
3087 if (ANS_NO == Message (MSG_YN, err_msg))
3088 {
3089 rval = FALSE;
3090 }
3091 err_msg = MemFree (err_msg);
3092 }
3093
3094 if (blank_column_list != NULL && rval)
3095 {
3096 err_msg = CreateListMessage ("Your import table contains blanks in column",
3097 " where data already exists in the sequences.\n"
3098 "Do you wish to erase these values in the sequences?\n"
3099 "If you say no, the old values will remain.",
3100 blank_column_list);
3101 ans = Message (MSG_YNC, err_msg);
3102 err_msg = MemFree (err_msg);
3103 if (ans == ANS_CANCEL)
3104 {
3105 rval = FALSE;
3106 }
3107 else if (ans == ANS_YES)
3108 {
3109 *erase_where_blank = TRUE;
3110 }
3111 }
3112
3113 #if 0
3114 /* ability to parse multiple entry format removed (for now) */
3115 if (parse_multi_list != NULL && rval)
3116 {
3117 err_msg = CreateListMessage ("Your import table contains values in column",
3118 " where the values are in form '(value1;value2)'.\n"
3119 "Do you wish to parse these values into multiple modifiers?\n"
3120 "If you say no, the values will be applied to a single modifier.",
3121 parse_multi_list);
3122 ans = Message (MSG_YNC, err_msg);
3123 err_msg = MemFree (err_msg);
3124 if (ans == ANS_CANCEL)
3125 {
3126 rval = FALSE;
3127 }
3128 else if (ans == ANS_YES)
3129 {
3130 *parse_multiple = TRUE;
3131 }
3132 }
3133 #endif
3134
3135 blank_column_list = ValNodeFree (blank_column_list);
3136 replace_column_list = ValNodeFreeData (replace_column_list);
3137 parse_multi_list = ValNodeFreeData (parse_multi_list);
3138
3139 return rval;
3140 }
3141
3142 static Boolean ImportModifiersToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3143 {
3144 ValNodePtr header_line, row_vnp, col_vnp, header_vnp;
3145 Boolean erase_where_blank = FALSE, parse_multi = FALSE;
3146 Int4Ptr sequence_numbers;
3147 Int4 num_rows, row_number;
3148
3149 if (iatep == NULL)
3150 {
3151 return FALSE;
3152 }
3153
3154 SendHelpScrollMessage (helpForm, "Organism Page", "Import Source Modifiers");
3155
3156 header_line = ReadRowListFromFile ();
3157 if (header_line == NULL || header_line->next == NULL)
3158 {
3159 header_line = FreeTableDisplayRowList (header_line);
3160 return FALSE;
3161 }
3162
3163 header_vnp = header_line->data.ptrvalue;
3164 if (header_vnp == NULL || header_vnp->next == NULL)
3165 {
3166 header_line = FreeTableDisplayRowList (header_line);
3167 return FALSE;
3168 }
3169
3170 num_rows = ValNodeLen (header_line->next);
3171 sequence_numbers = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
3172
3173 if (!ValidateModifierTableSequenceIDs (header_line, iatep, sequence_numbers, &num_rows))
3174 {
3175 header_line = FreeTableDisplayRowList (header_line);
3176 sequence_numbers = MemFree (sequence_numbers);
3177 return FALSE;
3178 }
3179
3180 /* first, validate all column names and values */
3181 if (!ValidateImportModifierColumnNames (header_line))
3182 {
3183 header_line = FreeTableDisplayRowList (header_line);
3184 sequence_numbers = MemFree (sequence_numbers);
3185 return FALSE;
3186 }
3187
3188 if (!ValidateTableValues (header_line))
3189 {
3190 header_line = FreeTableDisplayRowList (header_line);
3191 sequence_numbers = MemFree (sequence_numbers);
3192 return FALSE;
3193 }
3194
3195 if (!CheckModifiersForOverwrite (header_line, iatep,
3196 sequence_numbers, num_rows,
3197 &erase_where_blank, &parse_multi))
3198 {
3199 header_line = FreeTableDisplayRowList (header_line);
3200 sequence_numbers = MemFree (sequence_numbers);
3201 return FALSE;
3202 }
3203
3204 /* now apply */
3205 for (row_vnp = header_line->next, row_number = 0;
3206 row_vnp != NULL && row_number < num_rows;
3207 row_vnp = row_vnp->next, row_number++)
3208 {
3209 if (row_vnp->data.ptrvalue == NULL)
3210 {
3211 continue;
3212 }
3213 header_vnp = header_line->data.ptrvalue;
3214 col_vnp = row_vnp->data.ptrvalue;
3215
3216 /* skip the ID column */
3217 header_vnp = header_vnp->next;
3218 col_vnp = col_vnp->next;
3219
3220 for (;
3221 header_vnp != NULL && col_vnp != NULL;
3222 header_vnp = header_vnp->next, col_vnp = col_vnp->next)
3223 {
3224 iatep->title_list [sequence_numbers [row_number]] = ApplyImportModToTitle (iatep->title_list [sequence_numbers[row_number]],
3225 header_vnp->data.ptrvalue,
3226 col_vnp->data.ptrvalue,
3227 erase_where_blank,
3228 parse_multi);
3229 }
3230 }
3231 sequence_numbers = MemFree (sequence_numbers);
3232 return TRUE;
3233 }
3234
3235 typedef struct fastapage {
3236 DIALOG_MESSAGE_BLOCK
3237 Char path [PATH_MAX];
3238 SeqEntryPtr list;
3239 ValNodePtr errmsgs;
3240 DoC doc;
3241 GrouP instructions;
3242 GrouP have_seq_instr_grp;
3243 GrouP singleIdGrp;
3244 TexT singleSeqID;
3245 Boolean is_na;
3246 Boolean is_mrna;
3247 Boolean is_delta;
3248 Boolean parseSeqId;
3249 Boolean single;
3250 Int2Ptr seqPackagePtr;
3251 ButtoN import_btn;
3252 ButtoN clear_btn;
3253 } FastaPage, PNTR FastaPagePtr;
3254
3255 static ParData faParFmt = {FALSE, FALSE, FALSE, FALSE, FALSE, 0, 0};
3256 static ColData faColFmt = {0, 0, 80, 0, NULL, 'l', TRUE, FALSE, FALSE, FALSE, TRUE};
3257
3258 static void ResetFastaPage (FastaPagePtr fpp)
3259
3260 {
3261 SeqEntryPtr next;
3262 SeqEntryPtr sep;
3263
3264 if (fpp != NULL) {
3265 sep = fpp->list;
3266 while (sep != NULL) {
3267 next = sep->next;
3268 sep->next = NULL;
3269 SeqEntryFree (sep);
3270 sep = next;
3271 }
3272 fpp->list = NULL;
3273 fpp->errmsgs = ValNodeFreeData (fpp->errmsgs);
3274 }
3275 }
3276
3277 static CharPtr GetModValueFromSeqEntry (SeqEntryPtr sep, CharPtr mod_name)
3278 {
3279 CharPtr ttl = NULL;
3280 CharPtr value = NULL;
3281
3282 if (sep == NULL || StringHasNoText (mod_name))
3283 {
3284 return NULL;
3285 }
3286
3287 SeqEntryExplore (sep, (Pointer) (&ttl), FindFirstTitle);
3288 if (StringHasNoText (ttl))
3289 {
3290 return NULL;
3291 }
3292
3293 value = FindValueFromPairInDefline (mod_name, ttl);
3294
3295 return value;
3296 }
3297
3298 static void AddReportLine (CharPtr str, CharPtr name, CharPtr tmp)
3299
3300 {
3301 StringCat (str, name);
3302 StringCat (str, ": ");
3303 StringCat (str, tmp);
3304 StringCat (str, "\n");
3305 }
3306
3307 static CharPtr GetDisplayValue (CharPtr mod_name, CharPtr title, BoolPtr multi_found);
3308 static CharPtr GetDisplayValueFromModifierInfoList (CharPtr mod_name, ValNodePtr modifier_info_list, BoolPtr multi_found);
3309
3310 static void ReportModifiers (CharPtr str, CharPtr report_name,
3311 ValNodePtr modifier_info_list, CharPtr mod_name, CharPtr not_found_msg)
3312 {
3313 CharPtr valstr;
3314 Boolean multi_found = TRUE;
3315
3316 valstr = GetDisplayValueFromModifierInfoList (mod_name, modifier_info_list, &multi_found);
3317 if (IsNonTextModifier (mod_name) && StringICmp (valstr, "FALSE") == 0)
3318 {
3319 valstr = MemFree (valstr);
3320 }
3321
3322 if (!StringHasNoText (valstr)) {
3323 AddReportLine (str, report_name, valstr);
3324 } else if (!StringHasNoText (not_found_msg)) {
3325 StringCat (str, not_found_msg);
3326 }
3327 valstr = MemFree (valstr);
3328 }
3329
3330 static void LookupAndAddReportLine (CharPtr str, CharPtr report_name,
3331 CharPtr title, CharPtr mod_name, CharPtr not_found_msg)
3332 {
3333 CharPtr valstr;
3334 Boolean multi_found = TRUE;
3335
3336 valstr = GetDisplayValue (mod_name, title, &multi_found);
3337 if (IsNonTextModifier (mod_name) && StringICmp (valstr, "FALSE") == 0)
3338 {
3339 valstr = MemFree (valstr);
3340 }
3341
3342 if (!StringHasNoText (valstr)) {
3343 AddReportLine (str, report_name, valstr);
3344 } else if (!StringHasNoText (not_found_msg)) {
3345 StringCat (str, not_found_msg);
3346 }
3347 valstr = MemFree (valstr);
3348 }
3349
3350 static void LookupAndAddLocationReportLine (CharPtr str, CharPtr title)
3351 {
3352 CharPtr valstr;
3353
3354 valstr = FindValueFromPairInDefline ("location", title);
3355 if (!StringHasNoText (valstr) && StringICmp (valstr, "genomic") != 0) {
3356 AddReportLine (str, "Location", valstr);
3357 }
3358 valstr = MemFree (valstr);
3359 }
3360
3361 static CharPtr singlewarn = "\
3362 ERROR - You may not enter multiple segments for a single sequence submission.\
3363 You should either clear the nucleotide and import a single FASTA record, or\
3364 return to the Sequence Format form and choose the proper submission type.\n\n";
3365
3366 #define FastaFormatBufLen 2000
3367
3368 static Int4 CountSegSetSegments (SeqEntryPtr sep)
3369 {
3370 BioseqSetPtr bssp;
3371
3372 if (sep == NULL || sep->data.ptrvalue == NULL || ! IS_Bioseq_set (sep))
3373 {
3374 return 0;
3375 }
3376
3377 bssp = (BioseqSetPtr) sep->data.ptrvalue;
3378 if (bssp->_class != BioseqseqSet_class_segset)
3379 {
3380 return 0;
3381 }
3382 sep = bssp->seq_set;
3383
3384 while (sep != NULL)
3385 {
3386 if (IS_Bioseq_set (sep) && sep->data.ptrvalue != NULL)
3387 {
3388 bssp = (BioseqSetPtr) sep->data.ptrvalue;
3389 if (bssp->_class == BioseqseqSet_class_parts)
3390 {
3391 return ValNodeLen (bssp->seq_set);
3392 }
3393 }
3394 sep = sep->next;
3395 }
3396 return 0;
3397 }
3398
3399 static void FormatFastaDoc (FastaPagePtr fpp)
3400
3401 {
3402 Nlm_QualNameAssocPtr ap;
3403 BioseqPtr bsp;
3404 Boolean hasErrors;
3405 CharPtr label;
3406 Int4 len;
3407 CharPtr measure;
3408 SeqEntryPtr nsep = NULL;
3409 Int2 num;
3410 CharPtr plural;
3411 CharPtr ptr;
3412 SeqIdPtr sip;
3413 SeqEntryPtr sep;
3414 CharPtr str;
3415 CharPtr title;
3416 CharPtr ttl;
3417 CharPtr tmp;
3418 ValNodePtr vnp;
3419 Int4 num_seg;
3420 CharPtr valstr;
3421 ValNodePtr modifier_info_list = NULL;
3422
3423 if (fpp != NULL) {
3424 str = MemNew (sizeof (char) * FastaFormatBufLen);
3425 tmp = MemNew (sizeof (char) * FastaFormatBufLen);
3426 if (str == NULL || tmp == NULL) return;
3427 num = 0;
3428 len = 0;
3429 hasErrors = FALSE;
3430 for (sep = fpp->list; sep != NULL; sep = sep->next) {
3431 num++;
3432 if (IS_Bioseq (sep)) {
3433 bsp = (BioseqPtr) sep->data.ptrvalue;
3434 if (bsp != NULL) {
3435 len += bsp->length;
3436 }
3437 } else if (IS_Bioseq_set (sep)) {
3438 nsep = FindNucSeqEntry (sep);
3439 if (nsep != NULL && IS_Bioseq (nsep)) {
3440 bsp = (BioseqPtr) nsep->data.ptrvalue;
3441 if (bsp != NULL) {
3442 len += bsp->length;
3443 }
3444 }
3445 }
3446 }
3447 if (num > 1) {
3448 plural = "s";
3449 } else {
3450 plural = "";
3451 }
3452 if (fpp->single && num > 1) {
3453 AppendText (fpp->doc, singlewarn, &faParFmt, &faColFmt, programFont);
3454 hasErrors = TRUE;
3455 }
3456 if (fpp->is_mrna) {
3457 label = "Message";
3458 measure = "nucleotides";
3459 } else if (fpp->is_na) {
3460 label = "Sequence";
3461 measure = "bases";
3462 } else {
3463 label = "Sequence";
3464 measure = "amino acids";
3465 }
3466 if (fpp->is_mrna) {
3467 sprintf (str, "%d transcript sequence%s, total length %ld %s\n",
3468 (int) num, plural, (long) len, measure);
3469 } else if (fpp->is_na) {
3470 sprintf (str, "%d nucleotide sequence%s, total length %ld %s\n",
3471 (int) num, plural, (long) len, measure);
3472 } else {
3473 sprintf (str, "%d protein sequence%s, total length %ld %s\n",
3474 (int) num, plural, (long) len, measure);
3475 }
3476 AppendText (fpp->doc, str, &faParFmt, &faColFmt, programFont);
3477 vnp = fpp->errmsgs;
3478 num = 0;
3479 for (sep = fpp->list; sep != NULL; sep = sep->next) {
3480 num++;
3481 len = 0;
3482 num_seg = CountSegSetSegments (sep);
3483 sip = NULL;
3484 tmp [0] = '\0';
3485 if (IS_Bioseq (sep)) {
3486 bsp = (BioseqPtr) sep->data.ptrvalue;
3487 if (bsp != NULL) {
3488 len = bsp->length;
3489 sip = SeqIdFindWorst (bsp->id);
3490 SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
3491 }
3492 nsep = sep;
3493 } else if (IS_Bioseq_set (sep)) {
3494 nsep = FindNucSeqEntry (sep);
3495 if (nsep != NULL && IS_Bioseq (nsep)) {
3496 bsp = (BioseqPtr) nsep->data.ptrvalue;
3497 if (bsp != NULL) {
3498 len = bsp->length;
3499 sip = SeqIdFindWorst (bsp->id);
3500 SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
3501 }
3502 }
3503 }
3504
3505 /* if segmented set, show number of segments */
3506 if (num_seg > 0)
3507 {
3508 sprintf (str, "\nSegset %d Sequence ID: %s\nLength: %ld %s (%d segments)\n",
3509 (int) num, tmp, (long) len, measure, num_seg);
3510 }
3511 else
3512 {
3513 sprintf (str, "\n%s %d Sequence ID: %s\nLength: %ld %s\n", label,
3514 (int) num, tmp, (long) len, measure);
3515 }
3516 ttl = NULL;
3517 SeqEntryExplore (nsep, (Pointer) (&ttl), FindFirstTitle);
3518 title = StringSaveNoNull (ttl);
3519 modifier_info_list = ParseAllBracketedModifiers (title);
3520 if (title != NULL && (! fpp->is_na)) {
3521
3522 ReportModifiers (str, "Gene", modifier_info_list, "gene", "No gene name detected\n");
3523 ReportModifiers (str, "Protein", modifier_info_list, "protein", "No protein name detected\n");
3524 ReportModifiers (str, "Gene Syn", modifier_info_list, "gene_syn", NULL);
3525 ReportModifiers (str, "Protein Desc", modifier_info_list, "protein_desc", NULL);
3526
3527 ptr = StringISearch (title, "[orf]");
3528 if (ptr != NULL) {
3529 StringCat (str, "ORF indicated\n");
3530 }
3531 ReportModifiers (str, "Protein Comment", modifier_info_list, "comment", NULL);
3532 }
3533
3534 if (title != NULL && fpp->is_na && (! fpp->is_mrna)) {
3535 ReportModifiers (str, "Organism", modifier_info_list, "organism", NULL);
3536 ReportModifiers (str, "Lineage", modifier_info_list, "lineage", NULL);
3537 for (ap = current_orgmod_subtype_alist; ap->name != NULL; ap++) {
3538 ReportModifiers (str, ap->name, modifier_info_list, ap->name, NULL);
3539 }
3540 for (ap = current_subsource_subtype_alist; ap->name != NULL; ap++) {
3541 ReportModifiers (str, ap->name, modifier_info_list, ap->name, NULL);
3542 }
3543 LookupAndAddReportLine (str, "Note", title, "note", NULL);
3544 LookupAndAddReportLine (str, "Note", title, "subsource", NULL);
3545 LookupAndAddReportLine (str, "Molecule", title, "molecule", NULL);
3546 LookupAndAddReportLine (str, "MolType", title, "moltype", NULL);
3547 LookupAndAddLocationReportLine (str, title);
3548 LookupAndAddReportLine (str, "Genetic Code", title, "genetic_code", NULL);
3549 }
3550
3551 if (title != NULL && fpp->is_na && fpp->is_mrna) {
3552 LookupAndAddReportLine (str, "Gene", title, "gene", "No gene name detected\n");
3553 valstr = FindValueFromPairInDefline ("mrna", title);
3554 if (!StringHasNoText (valstr)) {
3555 AddReportLine (str, "mRNA", valstr);
3556 valstr = MemFree (valstr);
3557 } else {
3558 valstr = MemFree (valstr);
3559 valstr = FindValueFromPairInDefline ("cdna", title);
3560 if (!StringHasNoText (valstr)) {
3561 AddReportLine (str, "cDNA", valstr);
3562 } else {
3563 StringCat (str, "No mRNA name detected\n");
3564 }
3565 valstr = MemFree (valstr);
3566 }
3567 LookupAndAddReportLine (str, "Comment", title, "comment", NULL);
3568 }
3569 MemFree (title);
3570 ttl = NULL;
3571 SeqEntryExplore (nsep, (Pointer) (&ttl), FindFirstTitle);
3572 title = StringSaveNoNull (ttl);
3573 if (title != NULL) {
3574 RemoveRecognizedModifiersFromTitle (title, modifier_info_list, fpp->is_na);
3575 if (fpp->is_mrna) {
3576 StripAllInstancesOfModNameFromTitle ("gene", title);
3577 StripAllInstancesOfModNameFromTitle ("mrna", title);
3578 StripAllInstancesOfModNameFromTitle ("cdna", title);
3579 StripAllInstancesOfModNameFromTitle ("comment", title);
3580 }
3581 TrimSpacesAroundString (title);
3582 if (! StringHasNoText (title)) {
3583 StringCat (str, "Title: ");
3584 StringNCat (str, title, 128);
3585 StringCat (str, "\n");
3586 } else {
3587 StringCat (str, "No title detected\n");
3588 }
3589 }
3590 MemFree (title);
3591 ModifierInfoListFree (modifier_info_list);
3592 if (vnp != NULL && vnp->data.ptrvalue != NULL) {
3593 hasErrors = TRUE;
3594 StringCat (str, (CharPtr) vnp->data.ptrvalue);
3595 StringCat (str, "\n");
3596 }
3597 AppendText (fpp->doc, str, &faParFmt, &faColFmt, programFont);
3598 if (vnp != NULL) {
3599 vnp = vnp->next;
3600 }
3601 }
3602 MemFree (str);
3603 MemFree (tmp);
3604 UpdateDocument (fpp->doc, 0, 0);
3605 if (hasErrors) {
3606 Beep ();
3607 Beep ();
3608 Beep ();
3609 }
3610 }
3611 }
3612
3613 extern SeqEntryPtr ImportOneGappedSequence (FILE *fp)
3614 {
3615 BioseqPtr bsp;
3616 Pointer dataptr;
3617 Uint2 datatype;
3618 SeqEntryPtr topsep;
3619 SeqSubmitPtr ssp;
3620 ErrSev oldsev;
3621
3622 if (fp == NULL) return NULL;
3623
3624 oldsev = ErrSetMessageLevel (SEV_MAX);
3625 bsp = ReadDeltaFasta (fp, NULL);
3626 ErrSetMessageLevel (oldsev);
3627 if (bsp == NULL)
3628 {
3629 topsep = NULL;
3630 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE,
3631 TRUE, FALSE);
3632 if (dataptr != NULL)
3633 {
3634 /* Get a pointer to the new SeqEntry */
3635 if (datatype == OBJ_SEQENTRY)
3636 {
3637 topsep = (SeqEntryPtr) dataptr;
3638 }
3639 else if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET)
3640 {
3641 topsep = SeqMgrGetSeqEntryForData (dataptr);
3642 }
3643 else if (datatype == OBJ_SEQSUB)
3644 {
3645 ssp = (SeqSubmitPtr) dataptr;
3646 if (ssp != NULL && ssp->datatype == 1)
3647 {
3648 topsep = (SeqEntryPtr) ssp->data;
3649 }
3650 }
3651 }
3652 }
3653 else
3654 {
3655 topsep = SeqMgrGetSeqEntryForData (bsp);
3656 }
3657
3658 return topsep;
3659 }
3660
3661 static SeqEntryPtr SegsetFromSeqEntryList (SeqEntryPtr list)
3662 {
3663 SeqEntryPtr first_sep, tmp_sep, next_sep;
3664 BioseqPtr bsp;
3665 SeqDescrPtr sdp = NULL, set_sdp;
3666
3667 if (list == NULL)
3668 {
3669 return NULL;
3670 }
3671
3672 first_sep = list;
3673 next_sep = first_sep->next;
3674 first_sep->next = NULL;
3675
3676 /* grab title on first sequence to put on segmented bioseq */
3677 if (IS_Bioseq (first_sep) && first_sep->data.ptrvalue != NULL)
3678 {
3679 bsp = (BioseqPtr) first_sep->data.ptrvalue;
3680 sdp = bsp->descr;
3681 while (sdp != NULL && sdp->choice != Seq_descr_title)
3682 {
3683 sdp = sdp->next;
3684 }
3685 }
3686
3687 while (next_sep != NULL)
3688 {
3689 tmp_sep = next_sep;
3690 next_sep = tmp_sep->next;
3691 tmp_sep->next = NULL;
3692 AddSeqEntryToSeqEntry (first_sep, tmp_sep, TRUE);
3693 }
3694
3695 if (sdp != NULL && IS_Bioseq_set (first_sep))
3696 {
3697 tmp_sep = FindNucSeqEntry (first_sep);
3698 if (tmp_sep != NULL && IS_Bioseq (tmp_sep) && tmp_sep->data.ptrvalue != NULL)
3699 {
3700 bsp = tmp_sep->data.ptrvalue;
3701 set_sdp = bsp->descr;
3702 while (set_sdp != NULL && set_sdp->choice != Seq_descr_title)
3703 {
3704 set_sdp = set_sdp->next;
3705 }
3706 if (set_sdp == NULL)
3707 {
3708 set_sdp = CreateNewDescriptor (tmp_sep, Seq_descr_title);
3709 }
3710 if (set_sdp != NULL && StringHasNoText (set_sdp->data.ptrvalue))
3711 {
3712 /* make a copy, rather than removing the segment title */
3713 set_sdp->data.ptrvalue = MemFree (set_sdp->data.ptrvalue);
3714 set_sdp->data.ptrvalue = StringSave (sdp->data.ptrvalue);
3715 }
3716 }
3717 }
3718
3719 return first_sep;
3720 }
3721
3722 static void ReplaceFakeIDWithIDFromTitle (BioseqPtr bsp);
3723
3724 static SeqEntryPtr
3725 ReadOneSegSet
3726 (FILE *fp,
3727 Boolean parse_id,
3728 ValNodePtr PNTR err_msg_list,
3729 BoolPtr chars_stripped)
3730 {
3731 SeqEntryPtr nextsep;
3732 CharPtr errormsg = NULL;
3733 Char lastchar;
3734 SeqEntryPtr seg_list = NULL, seg_list_last = NULL;
3735 BioseqPtr bsp;
3736
3737 if (fp == NULL)
3738 {
3739 return NULL;
3740 }
3741
3742 /* note - we pass in FALSE for parse_id in SequinFastaToSeqEntryEx
3743 * because we do not want to use Sequin's auto-generated sequence IDs.
3744 * We then parse the sequence ID from the title ourselves using
3745 * ReplaceFakeIDWithIDFromTitle if parse_id is TRUE, or leave the ID
3746 * as blank to force the user to select a real ID later.
3747 */
3748 nextsep = SequinFastaToSeqEntryExEx (fp, TRUE, &errormsg, FALSE, &lastchar, chars_stripped);
3749 while (nextsep != NULL ||
3750 (lastchar != (Char) EOF && lastchar != NULLB && lastchar != (Char) 255
3751 && lastchar != ']'))
3752 {
3753 if (nextsep != NULL)
3754 {
3755 /* replace fake ID with ID from title */
3756 if (IS_Bioseq (nextsep) && nextsep->data.ptrvalue != NULL)
3757 {
3758 bsp = (BioseqPtr) nextsep->data.ptrvalue;
3759 if (parse_id)
3760 {
3761 ReplaceFakeIDWithIDFromTitle ((BioseqPtr) nextsep->data.ptrvalue);
3762 }
3763 else
3764 {
3765 bsp->id = SeqIdFree (bsp->id);
3766 }
3767 }
3768 SeqEntryPack (nextsep);
3769 if (seg_list_last == NULL)
3770 {
3771 seg_list = nextsep;
3772 }
3773 else
3774 {
3775 seg_list_last->next = nextsep;
3776 }
3777 seg_list_last = nextsep;
3778
3779 ValNodeAddPointer (err_msg_list, 0, errormsg);
3780 errormsg = NULL;
3781 }
3782 nextsep = SequinFastaToSeqEntryExEx (fp, TRUE, &errormsg, FALSE, &lastchar, chars_stripped);
3783 }
3784 nextsep = SegsetFromSeqEntryList (seg_list);
3785 return nextsep;
3786 }
3787
3788 static void AddDefaultMoleculeTypeToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3789 {
3790 Int4 seq_num;
3791 CharPtr old_value;
3792
3793 if (iatep == NULL)
3794 {
3795 return;
3796 }
3797
3798 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3799 {
3800 if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3801 {
3802 continue;
3803 }
3804 old_value = FindValueFromPairInDefline("moltype",
3805 iatep->title_list [seq_num]);
3806 if (StringHasNoText (old_value) || StringICmp (old_value, "dna") == 0)
3807 {
3808 iatep->title_list [seq_num] = ReplaceValueInOneDefLine(iatep->title_list [seq_num],
3809 "moltype",
3810 "Genomic DNA");
3811 }
3812 old_value = MemFree (old_value);
3813 }
3814 }
3815
3816 static void AddDefaultLocationToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3817 {
3818 Int4 seq_num;
3819 CharPtr old_value, first_organism, next_org_loc = NULL, org_stop;
3820
3821 if (iatep == NULL)
3822 {
3823 return;
3824 }
3825
3826 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3827 {
3828 if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3829 {
3830 continue;
3831 }
3832 first_organism = FindValuePairInDefLine ("organism", iatep->title_list [seq_num], &org_stop);
3833 if (first_organism != NULL)
3834 {
3835 next_org_loc = FindValuePairInDefLine ("organism", org_stop + 1, NULL);
3836 }
3837 else
3838 {
3839 next_org_loc = NULL;
3840 }
3841 old_value = FindValueFromPairInDeflineBeforeCharPtr ("location",
3842 iatep->title_list [seq_num],
3843 next_org_loc);
3844 if (StringHasNoText (old_value))
3845 {
3846 iatep->title_list [seq_num] = ReplaceValueInOneDefLineForOrganism (iatep->title_list [seq_num],
3847 "location",
3848 "genomic",
3849 first_organism);
3850 }
3851 old_value = MemFree (old_value);
3852 }
3853 }
3854
3855 static void AddDefaultTopologyToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3856 {
3857 Int4 seq_num;
3858 CharPtr old_value;
3859
3860 if (iatep == NULL)
3861 {
3862 return;
3863 }
3864
3865 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3866 {
3867 if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3868 {
3869 continue;
3870 }
3871 old_value = FindValueFromPairInDefline ("topology",
3872 iatep->title_list [seq_num]);
3873 if (StringHasNoText (old_value))
3874 {
3875 iatep->title_list [seq_num] = ReplaceValueInOneDefLine(iatep->title_list [seq_num],
3876 "topology",
3877 "Linear");
3878 }
3879 old_value = MemFree (old_value);
3880 }
3881 }
3882
3883 static void AddDefaultGeneticCodesToIDAndTitleEdit (IDAndTitleEditPtr iatep)
3884 {
3885 CharPtr taxname, location, gcode_name;
3886 Int4 gcode;
3887 ValNodePtr gencodelist;
3888 Int4 seq_num;
3889 CharPtr first_organism, next_org_loc = NULL, org_stop;
3890
3891 if (iatep == NULL)
3892 {
3893 return;
3894 }
3895
3896 gencodelist = GetGeneticCodeValNodeList ();
3897 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
3898 {
3899 if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
3900 {
3901 continue;
3902 }
3903 first_organism = FindValuePairInDefLine ("organism", iatep->title_list [seq_num], &org_stop);
3904 if (first_organism != NULL)
3905 {
3906 next_org_loc = FindValuePairInDefLine ("organism", org_stop + 1, NULL);
3907 }
3908 else
3909 {
3910 next_org_loc = NULL;
3911 }
3912
3913 taxname = FindValueFromPairInDefline ("organism", first_organism);
3914 location = FindValueFromPairInDeflineBeforeCharPtr ("location",
3915 iatep->title_list [seq_num],
3916 next_org_loc);
3917 if (StringHasNoText (location))
3918 {
3919 location = StringSave ("genomic");
3920 }
3921
3922 gcode = GetGeneticCodeForTaxNameAndLocation (taxname, location);
3923 taxname = MemFree (taxname);
3924 location = MemFree (location);
3925
3926 if (gcode < 0)
3927 {
3928 gcode_name = FindValueFromPairInDeflineBeforeCharPtr ("genetic_code",
3929 iatep->title_list [seq_num],
3930 next_org_loc);
3931 if (StringHasNoText (gcode_name))
3932 {
3933 gcode_name = MemFree (gcode_name);
3934 gcode_name = GeneticCodeStringFromIntAndList (1, gencodelist);
3935 iatep->title_list [seq_num] = ReplaceValueInOneDefLineForOrganism (iatep->title_list [seq_num],
3936 "genetic_code",
3937 gcode_name,
3938 first_organism);
3939 }
3940 else
3941 {
3942 gcode_name = MemFree (gcode_name);
3943 }
3944 }
3945 else
3946 {
3947 gcode_name = GeneticCodeStringFromIntAndList (gcode, gencodelist);
3948 iatep->title_list [seq_num] = ReplaceValueInOneDefLineForOrganism (iatep->title_list [seq_num],
3949 "genetic_code",
3950 gcode_name,
3951 first_organism);
3952 }
3953 }
3954 ValNodeFreeData (gencodelist);
3955 }
3956
3957 static void AddDefaultModifierValues (SeqEntryPtr seq_list)
3958 {
3959 IDAndTitleEditPtr iatep;
3960
3961 iatep = SeqEntryListToIDAndTitleEdit (seq_list);
3962 AddDefaultMoleculeTypeToIDAndTitleEdit (iatep);
3963 AddDefaultLocationToIDAndTitleEdit (iatep);
3964 AddDefaultTopologyToIDAndTitleEdit (iatep);
3965 AddDefaultGeneticCodesToIDAndTitleEdit (iatep);
3966 ApplyIDAndTitleEditToSeqEntryList (seq_list, iatep);
3967 iatep = IDAndTitleEditFree (iatep);
3968 }
3969
3970 static Boolean HasGapID (SeqEntryPtr sep)
3971 {
3972 BioseqPtr bsp;
3973 Char id_str [128];
3974 Int4 j;
3975
3976 if (sep == NULL || ! IS_Bioseq (sep) || (bsp = sep->data.ptrvalue) == NULL)
3977 {
3978 return FALSE;
3979 }
3980
3981 SeqIdWrite (bsp->id, id_str, PRINTID_REPORT, sizeof (id_str));
3982
3983 if (id_str [0] != '?')
3984 {
3985 return FALSE;
3986 }
3987 if (StringICmp (id_str + 1, "unk100") == 0)
3988 {
3989 return TRUE;
3990 }
3991 else
3992 {
3993 /* make sure there are only numbers after the question mark */
3994 j = 1;
3995 while (isdigit (id_str [j]))
3996 {
3997 j++;
3998 }
3999 if (id_str [j] == 0)
4000 {
4001 return TRUE;
4002 }
4003 else
4004 {
4005 return FALSE;
4006 }
4007 }
4008 }
4009
4010 static Boolean HasNoSeqID (SeqEntryPtr sep)
4011 {
4012 BioseqPtr bsp;
4013
4014 if (sep == NULL || ! IS_Bioseq (sep) || (bsp = sep->data.ptrvalue) == NULL)
4015 {
4016 return FALSE;
4017 }
4018 if (bsp->id == NULL)
4019 {
4020 return TRUE;
4021 }
4022 else
4023 {
4024 return FALSE;
4025 }
4026 }
4027
4028 static void PutDeflineIDBackInTitle (BioseqPtr bsp)
4029 {
4030 SeqDescrPtr sdp;
4031 CharPtr id_txt;
4032 CharPtr title_txt;
4033
4034 if (bsp == NULL || bsp->id == NULL)
4035 {
4036 return;
4037 }
4038
4039 sdp = bsp->descr;
4040 while (sdp != NULL && sdp->choice != Seq_descr_title)
4041 {
4042 sdp = sdp->next;
4043 }
4044 if (sdp == NULL)
4045 {
4046 sdp = CreateNewDescriptorOnBioseq (bsp, Seq_descr_title);
4047 }
4048 if (sdp == NULL)
4049 {
4050 return;
4051 }
4052
4053 id_txt = SeqIdWholeLabel (bsp->id, PRINTID_REPORT);
4054
4055 if (StringHasNoText (sdp->data.ptrvalue))
4056 {
4057 sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
4058 sdp->data.ptrvalue = id_txt;
4059 }
4060 else
4061 {
4062 title_txt = (CharPtr) MemNew (sizeof (Char) * (StringLen (id_txt) + StringLen (sdp->data.ptrvalue) + 2));
4063 StringCpy (title_txt, id_txt);
4064 StringCat (title_txt, " ");
4065 StringCat (title_txt, sdp->data.ptrvalue);
4066 sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
4067 sdp->data.ptrvalue = title_txt;
4068 id_txt = MemFree (id_txt);
4069 }
4070
4071 bsp->id = SeqIdFree (bsp->id);
4072 }
4073
4074 static Char GetNextCharacterFromFile (FILE *fp, BoolPtr pIsASN)
4075 {
4076 FileCache fc;
4077 CharPtr str;
4078 Char special_symbol;
4079 Char line [128];
4080 Int4 pos;
4081
4082 /* look ahead to see what character caused inability to interpret line */
4083 FileCacheSetup (&fc, fp);
4084 /* pos = FileCacheTell (&fc); */
4085 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4086 if (str != NULL && StringDoesHaveText (str)) {
4087 TrimSpacesAroundString (str);
4088 }
4089 special_symbol = line [0];
4090 if (pIsASN != NULL)
4091 {
4092 if (StringStr (line, "::=") != NULL)
4093 {
4094 *pIsASN = TRUE;
4095 }
4096 else
4097 {
4098 *pIsASN = FALSE;
4099 }
4100 }
4101 /* seek to start of next line after one that could not be interpreted */
4102 pos = FileCacheTell (&fc);
4103 FileCacheSetup (&fc, fp);
4104 FileCacheSeek (&fc, pos);
4105 fseek (fp, pos, SEEK_SET);
4106 return special_symbol;
4107 }
4108
4109 static Int4 FindLineForStartOfBadRead (FILE *fp, Int4 pos)
4110 {
4111 FileCache fc;
4112 Int4 line_num = 0;
4113 Char line [4096];
4114 CharPtr str;
4115
4116 if (fp == NULL || pos == 0) {
4117 return 0;
4118 }
4119
4120 FileCacheSetup (&fc, fp);
4121 FileCacheSeek (&fc, 0);
4122 fseek (fp, 0, SEEK_SET);
4123 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4124 while (str != NULL && FileCacheTell (&fc) < pos) {
4125 line_num++;
4126 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4127 }
4128 return line_num;
4129 }
4130
4131
4132 static Int4 FindLineForBadReadChar (FILE *fp, Char badchar)
4133 {
4134 FileCache fc;
4135 Int4 line_num = 0;
4136 Char line [4096];
4137 CharPtr str;
4138
4139 if (fp == NULL) {
4140 return 0;
4141 }
4142
4143 FileCacheSetup (&fc, fp);
4144 FileCacheSeek (&fc, 0);
4145 fseek (fp, 0, SEEK_SET);
4146 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4147 while (str != NULL && StringChr (str, badchar) == NULL) {
4148 line_num++;
4149 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
4150 }
4151 return line_num;
4152 }
4153
4154
4155 static void CleanTitles (SeqEntryPtr sep, ValNodePtr PNTR special_list)
4156 {
4157 BioseqPtr bsp;
4158 BioseqSetPtr bssp;
4159 SeqDescrPtr sdp = NULL;
4160
4161 while (sep != NULL)
4162 {
4163 sdp = NULL;
4164 if (sep->choice == 1)
4165 {
4166 bsp = sep->data.ptrvalue;
4167 if (bsp != NULL)
4168 {
4169 sdp = bsp->descr;
4170 }
4171 }
4172 else if (sep->choice == 2)
4173 {
4174 bssp = sep->data.ptrvalue;
4175 if (bssp != NULL)
4176 {
4177 CleanTitles (bssp->seq_set, special_list);
4178 sdp = bssp->descr;
4179 }
4180 }
4181 while (sdp != NULL)
4182 {
4183 if (sdp->choice == Seq_descr_title)
4184 {
4185 SpecialCharFindWithContext ((CharPtr PNTR) &(sdp->data.ptrvalue), special_list, NULL, NULL);
4186 }
4187 sdp = sdp->next;
4188 }
4189 sep = sep->next;
4190 }
4191 }
4192
4193
4194 static SeqEntryPtr ImportOnlyProteinSequences
4195 (FILE *fp,
4196 SeqEntryPtr sep_list,
4197 Boolean parse_id,
4198 CharPtr supplied_id_txt,
4199 ValNodePtr PNTR err_msg_list,
4200 BoolPtr chars_stripped)
4201 {
4202 Pointer dataptr;
4203 Uint2 datatype;
4204 SeqEntryPtr new_list = NULL, nextsep, lastsep = NULL, oldscope;
4205 Boolean error_reading = FALSE;
4206 Int4 pos, bad_start;
4207 BioseqPtr bsp;
4208 ValNodePtr special_list = NULL;
4209
4210 oldscope = SeqEntrySetScope (NULL);
4211
4212 pos = ftell (fp);
4213 dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, TRUE,
4214 FALSE, FALSE, chars_stripped);
4215 while (dataptr != NULL)
4216 {
4217 bsp = NULL;
4218 if (datatype == OBJ_SEQENTRY)
4219 {
4220 nextsep = (SeqEntryPtr) dataptr;
4221 if (IS_Bioseq(nextsep)) {
4222 bsp = nextsep->data.ptrvalue;
4223 } else {
4224 nextsep = NULL;
4225 }
4226 }
4227 else if (datatype == OBJ_BIOSEQ)
4228 {
4229 bsp = dataptr;
4230 nextsep = SeqMgrGetSeqEntryForData(bsp);
4231 }
4232 if (bsp != NULL)
4233 {
4234 if (parse_id)
4235 {
4236 ReplaceFakeIDWithIDFromTitle ((BioseqPtr) nextsep->data.ptrvalue);
4237 }
4238 else
4239 {
4240 if (!StringHasNoText (supplied_id_txt))
4241 {
4242 bsp->id = MakeSeqID (supplied_id_txt);
4243 }
4244 }
4245 SeqEntryPack (nextsep);
4246 if (lastsep == NULL) {
4247 new_list = nextsep;
4248 } else {
4249 lastsep->next = nextsep;
4250 }
4251 lastsep = nextsep;
4252 pos = ftell (fp);
4253 dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, TRUE,
4254 FALSE, FALSE, chars_stripped);
4255 }
4256 else
4257 {
4258 if (dataptr != NULL) {
4259 error_reading = TRUE;
4260 }
4261 dataptr = NULL;
4262 }
4263 }
4264 if (*chars_stripped || error_reading)
4265 {
4266 bad_start = FindLineForStartOfBadRead (fp, pos);
4267 Message (MSG_ERROR, "Unable to read file, starting at line %d!", bad_start);
4268 new_list = SeqEntryFree (new_list);
4269 }
4270 else
4271 {
4272 CleanTitles (new_list, &special_list);
4273 if (!FixSpecialCharactersForStringsInList (special_list, "Definition lines contain special characters.\nThe sequences cannot be imported unless the characters are replaced.", FALSE))
4274 {
4275 new_list = SeqEntryFree (new_list);
4276 }
4277 special_list = FreeContextList (special_list);
4278 }
4279
4280 lastsep = sep_list;
4281 while (lastsep != NULL && lastsep->next != NULL)
4282 {
4283 lastsep = lastsep->next;
4284 }
4285 if (lastsep == NULL)
4286 {
4287 sep_list = new_list;
4288 }
4289 else
4290 {
4291 lastsep->next = new_list;
4292 }
4293
4294 SeqEntrySetScope (oldscope);
4295
4296 return sep_list;
4297
4298 }
4299
4300 extern SeqEntryPtr
4301 ImportSequencesFromFileEx
4302 (FILE *fp,
4303 SeqEntryPtr sep_list,
4304 Boolean is_na,
4305 Boolean parse_id,
4306 CharPtr supplied_id_txt,
4307 ValNodePtr PNTR err_msg_list,
4308 BoolPtr chars_stripped,
4309 Boolean allow_char_stripping)
4310 {
4311 Int4 count;
4312 SeqEntryPtr last;
4313 Char lastchar;
4314 SeqEntryPtr nextsep;
4315 CharPtr errormsg = NULL;
4316 BioseqPtr bsp = NULL;
4317 SeqEntryPtr new_sep_list = NULL;
4318 ErrSev oldsev;
4319 Boolean read_from_delta;
4320 SeqEntryPtr oldscope;
4321 Int4 pos, last_no_id_start = -1;
4322 Boolean this_chars_stripped = FALSE;
4323 Boolean isASN = FALSE, isOnlyFASTA = FALSE;
4324 Int4 bad_start = 0, bad_line = 0;
4325 ValNodePtr special_list = NULL;
4326
4327 if (chars_stripped != NULL)
4328 {
4329 *chars_stripped = FALSE;
4330 }
4331
4332 if (!is_na) {
4333 return ImportOnlyProteinSequences (fp, sep_list, parse_id, supplied_id_txt, err_msg_list, chars_stripped);
4334 }
4335
4336 count = 0;
4337
4338 new_sep_list = NULL;
4339 last = NULL;
4340
4341 oldscope = SeqEntrySetScope (NULL);
4342
4343 pos = ftell (fp);
4344
4345 bsp = NULL;
4346 oldsev = ErrSetMessageLevel (SEV_MAX);
4347 bsp = ReadDeltaFastaEx (fp, NULL, &this_chars_stripped);
4348 if (chars_stripped != NULL)
4349 {
4350 *chars_stripped |= this_chars_stripped;
4351 }
4352 if (bsp != NULL && !parse_id)
4353 {
4354 PutDeflineIDBackInTitle (bsp);
4355 if (!StringHasNoText (supplied_id_txt))
4356 {
4357 bsp->id = MakeSeqID (supplied_id_txt);
4358 }
4359 }
4360 ErrSetMessageLevel (oldsev);
4361
4362 /* note - we pass in FALSE for parse_id in SequinFastaToSeqEntryEx
4363 * because we do not want to use Sequin's auto-generated sequence IDs.
4364 * We then parse the sequence ID from the title ourselves using
4365 * ReplaceFakeIDWithIDFromTitle if parse_id is TRUE, or leave the ID
4366 * as blank to force the user to select a real ID later.
4367 */
4368
4369 if (bsp == NULL)
4370 {
4371 bsp = ReadFastaOnly (fp,
4372 is_na, !is_na,
4373 &this_chars_stripped,
4374 &lastchar);
4375 if (bsp == NULL)
4376 {
4377 if (lastchar == 0)
4378 {
4379 lastchar = GetNextCharacterFromFile (fp, &isASN);
4380 }
4381 } else {
4382 isOnlyFASTA = TRUE;
4383 }
4384
4385 nextsep = SeqMgrGetSeqEntryForData (bsp);
4386 if (chars_stripped != NULL)
4387 {
4388 *chars_stripped |= this_chars_stripped;
4389 }
4390 read_from_delta = FALSE;
4391 }
4392 else
4393 {
4394 nextsep = SeqMgrGetSeqEntryForData (bsp);
4395 lastchar = '\n';
4396 read_from_delta = TRUE;
4397 }
4398 while ((nextsep != NULL ||
4399 (lastchar == '\n' || lastchar == '['))
4400 && !isASN
4401 && (allow_char_stripping || !this_chars_stripped))
4402 {
4403 if (nextsep != NULL)
4404 {
4405 if (!read_from_delta
4406 && IS_Bioseq (nextsep)
4407 && nextsep->data.ptrvalue != NULL)
4408 {
4409 bsp = (BioseqPtr) nextsep->data.ptrvalue;
4410 /* replace fake ID with ID from title for sequences that aren't deltas */
4411 if (parse_id)
4412 {
4413 ReplaceFakeIDWithIDFromTitle ((BioseqPtr) nextsep->data.ptrvalue);
4414 }
4415 else
4416 {
4417 bsp->id = SeqIdFree (bsp->id);
4418 if (!StringHasNoText (supplied_id_txt))
4419 {
4420 bsp->id = MakeSeqID (supplied_id_txt);
4421 }
4422 }
4423 SeqEntryPack (nextsep);
4424 }
4425
4426 if (last_no_id_start > -1)
4427 {
4428 if (HasGapID (nextsep))
4429 {
4430 nextsep = SeqEntryFree (nextsep);
4431 bsp = last->data.ptrvalue;
4432 SeqMgrDeleteFromBioseqIndex (bsp);
4433 bsp = BioseqFree (bsp);
4434 fseek (fp, last_no_id_start, SEEK_SET);
4435 bsp = ReadDeltaFastaWithEmptyDefline (fp, NULL, chars_stripped);
4436 last->data.ptrvalue = bsp;
4437 bsp->id = SeqIdFree (bsp->id);
4438 last_no_id_start = -1;
4439 }
4440 else if (HasNoSeqID (nextsep))
4441 {
4442 last_no_id_start = pos;
4443 }
4444 else
4445 {
4446 last_no_id_start = -1;
4447 }
4448 }
4449 else if (HasNoSeqID (nextsep))
4450 {
4451 last_no_id_start = pos;
4452 }
4453
4454 ValNodeAddPointer (err_msg_list, 0, errormsg);
4455 errormsg = NULL;
4456 }
4457 else if (lastchar == '[')
4458 {
4459 nextsep = ReadOneSegSet (fp, parse_id, err_msg_list, &this_chars_stripped);
4460 if (chars_stripped != NULL)
4461 {
4462 *chars_stripped |= this_chars_stripped;
4463 }
4464 }
4465 if (nextsep != NULL)
4466 {
4467 if (last == NULL)
4468 {
4469 new_sep_list = nextsep;
4470 last = nextsep;
4471 }
4472 else
4473 {
4474 last->next = nextsep;
4475 last = nextsep;
4476 }
4477 }
4478
4479 pos = ftell (fp);
4480 bsp = NULL;
4481 if (isOnlyFASTA)
4482 {
4483 lastchar = (Char) EOF;
4484 nextsep = NULL;
4485 }
4486 else if (is_na)
4487 {
4488 oldsev = ErrSetMessageLevel (SEV_MAX);
4489 bsp = ReadDeltaFastaEx (fp, NULL, &this_chars_stripped);
4490 if (chars_stripped != NULL)
4491 {
4492 *chars_stripped |= this_chars_stripped;
4493 }
4494 ErrSetMessageLevel (oldsev);
4495 if (!parse_id)
4496 {
4497 PutDeflineIDBackInTitle (bsp);
4498 }
4499 }
4500
4501 if (isOnlyFASTA) {
4502 /* done with loop */
4503 }
4504 else if (bsp == NULL)
4505 {
4506 bsp = ReadFastaOnly (fp,
4507 is_na, !is_na,
4508 &this_chars_stripped,
4509 &lastchar);
4510 if (bsp == NULL)
4511 {
4512 if (lastchar == 0)
4513 {
4514 lastchar = GetNextCharacterFromFile (fp, &isASN);
4515 }
4516 }
4517 else
4518 {
4519 isOnlyFASTA = TRUE;
4520 }
4521 nextsep = SeqMgrGetSeqEntryForData (bsp);
4522 if (chars_stripped != NULL)
4523 {
4524 *chars_stripped |= this_chars_stripped;
4525 }
4526 read_from_delta = FALSE;
4527 }
4528 else
4529 {
4530 nextsep = SeqMgrGetSeqEntryForData (bsp);
4531 lastchar = '\n';
4532 read_from_delta = TRUE;
4533 }
4534 }
4535
4536 if ((!allow_char_stripping && this_chars_stripped) || (lastchar != (Char) EOF && lastchar != NULLB && lastchar != (Char) 255))
4537 {
4538 if (!this_chars_stripped && !isASN) {
4539 bad_start = FindLineForStartOfBadRead (fp, pos);
4540 bad_line = FindLineForBadReadChar (fp, lastchar);
4541 Message (MSG_ERROR, "Unable to read file, starting at line %d (found bad character '%c' at line %d)!", bad_start, lastchar, bad_line);
4542 }
4543 new_sep_list = SeqEntryFree (new_sep_list);
4544 }
4545 else
4546 {
4547 CleanTitles (new_sep_list, &special_list);
4548 if (!FixSpecialCharactersForStringsInList (special_list, "Definition lines contain special characters.\nThe sequences cannot be imported unless the characters are replaced.", FALSE))
4549 {
4550 new_sep_list = SeqEntryFree (new_sep_list);
4551 }
4552 special_list = FreeContextList (special_list);
4553 }
4554
4555 last = sep_list;
4556 while (last != NULL && last->next != NULL)
4557 {
4558 last = last->next;
4559 }
4560 if (last == NULL)
4561 {
4562 sep_list = new_sep_list;
4563 }
4564 else
4565 {
4566 last->next = new_sep_list;
4567 }
4568
4569 SeqEntrySetScope (oldscope);
4570
4571 return sep_list;
4572 }
4573
4574
4575 extern SeqEntryPtr
4576 ImportSequencesFromFile
4577 (FILE *fp,
4578 SeqEntryPtr sep_list,
4579 Boolean is_na,
4580 Boolean parse_id,
4581 CharPtr supplied_id_txt,
4582 ValNodePtr PNTR err_msg_list,
4583 BoolPtr chars_stripped)
4584 {
4585 return ImportSequencesFromFileEx (fp, sep_list, is_na, parse_id, supplied_id_txt, err_msg_list, chars_stripped, FALSE);
4586 }
4587
4588
4589 static Boolean CollectIDsAndTitles (SeqEntryPtr new_list, SeqEntryPtr current_list, Boolean is_nuc);
4590
4591 static SeqEntryPtr RemoveZeroLengthSequences (SeqEntryPtr list, Int4Ptr pnum_seqs, Int4Ptr pnum_zero)
4592 {
4593 SeqEntryPtr prev_sep, next_sep, this_sep;
4594 Int4 num_seqs = 0, num_zero = 0;
4595 BioseqPtr bsp;
4596 BioseqSetPtr bssp;
4597
4598 if (list == NULL)
4599 {
4600 return NULL;
4601 }
4602
4603 prev_sep = NULL;
4604 this_sep = list;
4605 while (this_sep != NULL)
4606 {
4607 num_seqs++;
4608 next_sep = this_sep->next;
4609 if (this_sep->data.ptrvalue == NULL)
4610 {
4611 num_zero++;
4612 if (prev_sep == NULL)
4613 {
4614 list = next_sep;
4615 }
4616 else
4617 {
4618 prev_sep->next = next_sep;
4619 }
4620 this_sep->next = NULL;
4621 SeqEntryFree (this_sep);
4622 }
4623 else if (IS_Bioseq (this_sep))
4624 {
4625 bsp = (BioseqPtr) this_sep->data.ptrvalue;
4626 if (bsp->length == 0)
4627 {
4628 num_zero++;
4629
4630 if (prev_sep == NULL)
4631 {
4632 list = next_sep;
4633 }
4634 else
4635 {
4636 prev_sep->next = next_sep;
4637 }
4638 this_sep->next = NULL;
4639 SeqEntryFree (this_sep);
4640 }
4641 else
4642 {
4643 prev_sep = this_sep;
4644 }
4645 }
4646 else if (IS_Bioseq_set (this_sep))
4647 {
4648 bssp = (BioseqSetPtr) this_sep->data.ptrvalue;
4649 bssp->seq_set = RemoveZeroLengthSequences (bssp->seq_set, pnum_seqs, pnum_zero);
4650 if (bssp->seq_set == NULL)
4651 {
4652 num_zero++;
4653 if (prev_sep == NULL)
4654 {
4655 list = next_sep;
4656 }
4657 else
4658 {
4659 prev_sep->next = next_sep;
4660 }
4661 this_sep->next = NULL;
4662 SeqEntryFree (this_sep);
4663 }
4664 else
4665 {
4666 prev_sep = this_sep;
4667 }
4668 }
4669 else
4670 {
4671 prev_sep = this_sep;
4672 }
4673 this_sep = next_sep;
4674 }
4675
4676 if (pnum_seqs != NULL)
4677 {
4678 *pnum_seqs += num_seqs;
4679 }
4680 if (pnum_zero != NULL)
4681 {
4682 *pnum_zero += num_zero;
4683 }
4684 return list;
4685 }
4686
4687 static Boolean RejectZeroLengthSequences (SeqEntryPtr PNTR new_list)
4688 {
4689 SeqEntryPtr next_sep;
4690 Int4 num_zero = 0, num_seq = 0;
4691 Boolean rval = TRUE;
4692 Boolean delete_all = FALSE;
4693
4694 if (new_list == NULL)
4695 {
4696 return FALSE;
4697 }
4698
4699 *new_list = RemoveZeroLengthSequences (*new_list, &num_seq, &num_zero);
4700
4701 if (num_zero > 0)
4702 {
4703 ResetSegSetIDLists (*new_list);
4704 if (num_zero == num_seq)
4705 {
4706 Message (MSG_ERROR, "The sequences in your file are empty - you cannot import them.");
4707 delete_all = TRUE;
4708 rval = FALSE;
4709 }
4710 else if (ANS_CANCEL == Message (MSG_OKC, "%d sequences in your file are empty and cannot be imported. "
4711 "Would you like to import the remaining sequences?", num_zero))
4712 {
4713 delete_all = TRUE;
4714 rval = FALSE;
4715 }
4716 if (delete_all)
4717 {
4718
4719 while ((*new_list) != NULL)
4720 {
4721 next_sep = (*new_list)->next;
4722 (*new_list)->next = NULL;
4723 SeqEntryFree (*new_list);
4724 *new_list = next_sep;
4725 }
4726 }
4727 }
4728 return rval;
4729 }
4730
4731 static Boolean RejectExtraSequences (SeqEntryPtr new_list, FastaPagePtr fpp)
4732 {
4733 SeqEntryPtr sep, next_sep;
4734
4735 if (new_list == NULL || fpp == NULL)
4736 {
4737 return FALSE;
4738 }
4739 else if (!fpp->single || new_list->next == NULL)
4740 {
4741 return TRUE;
4742 }
4743
4744 if (fpp->is_na
4745 && fpp->seqPackagePtr != NULL
4746 && *(fpp->seqPackagePtr) != SEQ_PKG_GENOMICCDNA)
4747 {
4748 if (Message (MSG_YN, "You are importing multiple sequences - did you intend to create a batch submission?") == ANS_YES)
4749 {
4750 *(fpp->seqPackagePtr) = SEQ_PKG_GENBANK;
4751 fpp->single = FALSE;
4752 SafeHide (fpp->singleIdGrp);
4753 return TRUE;
4754 }
4755 }
4756 if (Message (MSG_YN, "You cannot import multiple sequences - import the first one and ignore the rest?") == ANS_YES)
4757 {
4758 sep = new_list->next;
4759 new_list->next = NULL;
4760 while (sep != NULL)
4761 {
4762 next_sep = sep->next;
4763 sep->next = NULL;
4764 sep = SeqEntryFree (sep);
4765 sep = next_sep;
4766 }
4767 return TRUE;
4768 }
4769 else
4770 {
4771 return FALSE;
4772 }
4773 }
4774
4775 static void ShowImportHelp (ButtoN b)
4776 {
4777 CharPtr help_msg;
4778
4779 help_msg = (CharPtr) GetObjectExtra (b);
4780 if (help_msg == NULL)
4781 {
4782 return;
4783 }
4784
4785 Message (MSG_OK, help_msg);
4786 }
4787
4788 static Boolean OkToImport (CharPtr msg, CharPtr help_msg)
4789 {
4790 WindoW w;
4791 GrouP h, c;
4792 PrompT p;
4793 ButtoN b;
4794 ModalAcceptCancelData acd;
4795
4796 if (msg == NULL)
4797 {
4798 return TRUE;
4799 }
4800 acd.accepted = FALSE;
4801 acd.cancelled = FALSE;
4802
4803 w = ModalWindow(-20, -13, -10, -10, NULL);
4804 h = HiddenGroup (w, -1, 0, NULL);
4805
4806 p = StaticPrompt (h, msg, 0, 0, programFont, 'l');
4807 c = HiddenGroup (h, 3, 0, NULL);
4808 b = PushButton (c, "Yes", ModalAcceptButton);
4809 SetObjectExtra (b, &acd, NULL);
4810 b = PushButton (c, "No", ModalCancelButton);
4811 SetObjectExtra (b, &acd, NULL);
4812 if (help_msg != NULL)
4813 {
4814 b = PushButton (c, "Help", ShowImportHelp);
4815 SetObjectExtra (b, help_msg, NULL);
4816 }
4817 AlignObjects (ALIGN_CENTER, (HANDLE) p, (HANDLE) c, NULL);
4818
4819 Show(w);
4820 Select (w);
4821 while (!acd.accepted && ! acd.cancelled)
4822 {
4823 ProcessExternalEvent ();
4824 Update ();
4825 }
4826 ProcessAnEvent ();
4827 Remove (w);
4828 if (acd.accepted)
4829 {
4830 return TRUE;
4831 }
4832 else
4833 {
4834 return FALSE;
4835 }
4836 }
4837
4838 static CharPtr segset_import_help_str = "Segmented sequence: a collection of non-overlapping, non-contiguous sequences that cover a specified genetic region. A standard example is a set of genomic DNA sequences that encode exons from a gene along with fragments of their flanking introns.";
4839 static CharPtr gapped_import_help_str = "Gapped sequence: a sequence with one or more gaps of known or unknown length.";
4840
4841
4842 static Boolean ImportedSequenceTypeOk (SeqEntryPtr list, Int2 seqPackage)
4843 {
4844 BioseqPtr bsp;
4845 Boolean rval = TRUE;
4846
4847 if (list == NULL || seqPackage != SEQ_PKG_SINGLE)
4848 {
4849 return TRUE;
4850 }
4851 if (list->choice == 1)
4852 {
4853 bsp = (BioseqPtr) list->data.ptrvalue;
4854 if (bsp != NULL && bsp->repr == Seq_repr_delta)
4855 {
4856 SendHelpScrollMessage (helpForm, "Sequence Format Form", NULL);
4857 rval = OkToImport ("You have imported a gapped sequence. Did you mean to do that?",
4858 gapped_import_help_str);
4859 }
4860 }
4861 else if (list->choice == 2)
4862 {
4863 SendHelpScrollMessage (helpForm, "Sequence Format Form", NULL);
4864 rval = OkToImport ("You have imported a segmented sequence. Did you mean to do that?",
4865 segset_import_help_str);
4866 }
4867 return rval;
4868 }
4869
4870 static Boolean ImportFastaDialog (DialoG d, CharPtr filename)
4871
4872 {
4873 CharPtr extension;
4874 FILE *f;
4875 FastaPagePtr fpp;
4876 ValNodePtr head;
4877 Char path [PATH_MAX];
4878 RecT r;
4879 SeqEntryPtr sep, new_sep_list, new_sep, test_sep;
4880 Boolean rval = FALSE;
4881 BioseqPtr bsp;
4882 CharPtr supplied_id_txt = NULL;
4883 Boolean chars_stripped = FALSE;
4884
4885 path [0] = '\0';
4886 StringNCpy_0 (path, filename, sizeof (path));
4887 fpp = (FastaPagePtr) GetObjectExtra (d);
4888 if (fpp != NULL) {
4889 if (fpp->list != NULL && fpp->single)
4890 {
4891 if (!fpp->is_na
4892 || fpp->seqPackagePtr == NULL
4893 || *fpp->seqPackagePtr == SEQ_PKG_GENOMICCDNA)
4894 {
4895 Message (MSG_ERROR, "Can't import additional sequences!");
4896 return FALSE;
4897 }
4898 else
4899 {
4900 if (Message (MSG_YN, "You are importing multiple sequences - did you intend to create a batch submission?") == ANS_NO)
4901 {
4902 Message (MSG_ERROR, "Can't import additional sequences!");
4903 return FALSE;
4904 }
4905 else
4906 {
4907 *(fpp->seqPackagePtr) = SEQ_PKG_GENBANK;
4908 fpp->single = FALSE;
4909 SafeHide (fpp->singleIdGrp);
4910 }
4911 }
4912 }
4913 extension = NULL;
4914 if (fpp->is_mrna) {
4915 extension = GetAppProperty ("FastaNucExtension");
4916 } else if (fpp->is_na) {
4917 extension = GetAppProperty ("FastaNucExtension");
4918 } else {
4919 extension = GetAppProperty ("FastaProtExtension");
4920 }
4921 if (path [0] != '\0' || GetInputFileName (path, sizeof (path), extension, "TEXT")) {
4922 WatchCursor ();
4923 StringCpy (fpp->path, path);
4924 ObjectRect (fpp->doc, &r);
4925 InsetRect (&r, 4, 4);
4926 faColFmt.pixWidth = r.right - r.left;
4927 /*
4928 ResetFastaPage (fpp);
4929 */
4930 Reset (fpp->doc);
4931 Update ();
4932 sep = fpp->list;
4933 head = fpp->errmsgs;
4934 f = FileOpen (fpp->path, "r");
4935 if (f == NULL)
4936 {
4937 Message (MSG_ERROR, "Unable to open %s", fpp->path);
4938 fpp->path[0] = 0;
4939 }
4940 else
4941 {
4942 if (fpp->singleSeqID != NULL)
4943 {
4944 supplied_id_txt = SaveStringFromText (fpp->singleSeqID);
4945 }
4946 new_sep_list = ImportSequencesFromFile (f, NULL, fpp->is_na,
4947 fpp->parseSeqId,
4948 supplied_id_txt,
4949 &head, &chars_stripped);
4950 if (chars_stripped && new_sep_list != NULL)
4951 {
4952 if (ANS_CANCEL == Message (MSG_OKC, "Illegal characters will be stripped from your sequence data. Do you want to continue?"))
4953 {
4954 new_sep_list = SeqEntryFree (new_sep_list);
4955 FileClose (f);
4956 fpp->path [0] = 0;
4957 ArrowCursor ();
4958 Update ();
4959 return FALSE;
4960 }
4961 }
4962 supplied_id_txt = MemFree (supplied_id_txt);
4963 if (fpp->seqPackagePtr != NULL
4964 && *(fpp->seqPackagePtr) == SEQ_PKG_SEGMENTED
4965 && new_sep_list != NULL
4966 && IS_Bioseq (new_sep_list))
4967 {
4968 new_sep_list = SegsetFromSeqEntryList (new_sep_list);
4969 }
4970 FileClose (f);
4971
4972 if (new_sep_list != NULL
4973 && new_sep_list->next == NULL
4974 && fpp->single
4975 && fpp->list == NULL
4976 && fpp->is_na
4977 && new_sep_list->choice == 1
4978 && new_sep_list->data.ptrvalue != NULL)
4979 {
4980 bsp = (BioseqPtr) new_sep_list->data.ptrvalue;
4981
4982 /* assign a fake ID if there is only one sequence being imported,
4983 * the package type is single, and there are no other sequences
4984 * from previous imports.
4985 */
4986
4987 if (bsp->id == NULL)
4988 {
4989 bsp->id = MakeSeqID ("nuc_1");
4990 }
4991 }
4992
4993 if (new_sep_list == NULL)
4994 {
4995 Message (MSG_ERROR, "Unable to read sequences from %s", fpp->path);
4996 fpp->path [0] = 0;
4997 }
4998 else if (! RejectZeroLengthSequences (&new_sep_list))
4999 {
5000 fpp->path [0] = 0;
5001 }
5002 else if (! RejectExtraSequences (new_sep_list, fpp))
5003 {
5004 /* if unsuccessful, delete new list */
5005 new_sep = new_sep_list;
5006 while (new_sep != NULL)
5007 {
5008 test_sep = new_sep->next;
5009 SeqEntryFree (new_sep);
5010 new_sep = test_sep;
5011 }
5012 fpp->path [0] = 0;
5013 }
5014 else if (fpp->seqPackagePtr != NULL
5015 && ! ImportedSequenceTypeOk (new_sep_list, *(fpp->seqPackagePtr)))
5016 {
5017 /* if unsuccessful, delete new list */
5018 new_sep = new_sep_list;
5019 while (new_sep != NULL)
5020 {
5021 test_sep = new_sep->next;
5022 SeqEntryFree (new_sep);
5023 new_sep = test_sep;
5024 }
5025 fpp->path [0] = 0;
5026 }
5027 else if (CollectIDsAndTitles (new_sep_list, fpp->list, (fpp->is_na && ! fpp->is_mrna)))
5028 {
5029 if (fpp->is_na)
5030 {
5031 /* add default molecule type, topology, location, and genetic codes */
5032 AddDefaultModifierValues (new_sep_list);
5033 }
5034
5035 /* if successful, link old and new lists */
5036 ValNodeLink (&(fpp->list), new_sep_list);
5037 rval = TRUE;
5038 }
5039 else
5040 {
5041 /* if unsuccessful, delete new list */
5042 new_sep = new_sep_list;
5043 while (new_sep != NULL)
5044 {
5045 test_sep = new_sep->next;
5046 SeqEntryFree (new_sep);
5047 new_sep = test_sep;
5048 }
5049 fpp->path [0] = 0;
5050 }
5051 }
5052
5053 if (fpp->list == NULL)
5054 {
5055 SafeHide (fpp->have_seq_instr_grp);
5056 Reset (fpp->doc);
5057 SafeShow (fpp->instructions);
5058 Update ();
5059 SetTitle (fpp->import_btn, "Import Nucleotide FASTA");
5060 Enable (fpp->import_btn);
5061 Disable (fpp->clear_btn);
5062 }
5063 else
5064 {
5065 SafeHide (fpp->instructions);
5066 Update ();
5067 if (! fpp->is_na || fpp->single
5068 || fpp->seqPackagePtr == NULL
5069 || *fpp->seqPackagePtr == SEQ_PKG_GENOMICCDNA)
5070 {
5071 Disable (fpp->import_btn);
5072 }
5073 else
5074 {
5075 Enable (fpp->import_btn);
5076 SetTitle (fpp->import_btn, "Import Additional Nucleotide FASTA");
5077 }
5078 Enable (fpp->clear_btn);
5079 FormatFastaDoc (fpp);
5080 SafeShow (fpp->have_seq_instr_grp);
5081 }
5082 ArrowCursor ();
5083 Update ();
5084 return rval;
5085 }
5086 }
5087 return FALSE;
5088 }
5089
5090 #define EXPORT_PAGE_WIDTH 80
5091
5092 static void ExportSeqIdAndTitle (SeqIdPtr sip, CharPtr title, FILE *fp)
5093 {
5094 CharPtr id_str = NULL;
5095
5096 if (fp == NULL)
5097 {
5098 return;
5099 }
5100
5101 id_str [0] = 0;
5102 if (sip == NULL)
5103 {
5104 id_str = StringSave ("unknown_id");
5105 }
5106 else
5107 {
5108 id_str = SeqIdWholeLabel (sip, PRINTID_REPORT);
5109 }
5110
5111 if (StringCSpn (id_str, " \t") == StringLen (id_str))
5112 {
5113 fprintf (fp, ">%s %s\n", id_str, title == NULL ? "" : title);
5114 }
5115 else
5116 {
5117 fprintf (fp, ">'%s' %s\n", id_str, title == NULL ? "" : title);
5118 }
5119 id_str = MemFree (id_str);
5120 }
5121
5122 static void ExportSeqPort (Int4 from, Int4 to, SeqPortPtr spp, FILE *fp)
5123 {
5124 Char buffer [EXPORT_PAGE_WIDTH + 1];
5125 Int4 seq_offset, txt_out;
5126
5127 if (spp == NULL || fp == NULL || from < 0 || to <= from)
5128 {
5129 return;
5130 }
5131
5132 seq_offset = from;
5133 while (seq_offset < to)
5134 {
5135 txt_out = ReadBufferFromSep (spp, buffer, seq_offset,
5136 MIN (seq_offset + EXPORT_PAGE_WIDTH, to), 0);
5137 if (txt_out == 0) break;
5138 seq_offset += txt_out;
5139 fprintf(fp, "%s\n", buffer);
5140 }
5141
5142 }
5143
5144 static void ExportOneRawSequence (BioseqPtr bsp, CharPtr title_master, FILE *fp)
5145 {
5146 SeqDescrPtr sdp;
5147 Char buffer [EXPORT_PAGE_WIDTH + 1];
5148 SeqPortPtr spp;
5149 CharPtr title = NULL;
5150 CharPtr combined_title = NULL;
5151 Boolean free_combined = FALSE;
5152
5153 if (bsp == NULL || fp == NULL || bsp->repr != Seq_repr_raw)
5154 {
5155 return;
5156 }
5157
5158 sdp = bsp->descr;
5159 while (sdp != NULL && sdp->choice != Seq_descr_title)
5160 {
5161 sdp = sdp->next;
5162 }
5163 if (sdp != NULL)
5164 {
5165 title = sdp->data.ptrvalue;
5166 }
5167
5168 if (StringHasNoText (title_master))
5169 {
5170 combined_title = title;
5171 }
5172 else if (StringHasNoText (title))
5173 {
5174 combined_title = title_master;
5175 }
5176 else
5177 {
5178 combined_title = (CharPtr) MemNew ((StringLen (title_master) + StringLen (title) + 2) * sizeof (Char));
5179 if (combined_title != NULL)
5180 {
5181 StringCpy (combined_title, title_master);
5182 StringCat (combined_title, " ");
5183 StringCat (combined_title, title);
5184 free_combined = TRUE;
5185 }
5186 }
5187
5188 ExportSeqIdAndTitle (bsp->id, combined_title, fp);
5189 if (free_combined)
5190 {
5191 combined_title = MemFree (combined_title);
5192 }
5193
5194 buffer [EXPORT_PAGE_WIDTH] = 0;
5195
5196 spp = SeqPortNew (bsp, 0, bsp->length-1, Seq_strand_plus, Seq_code_iupacna);
5197
5198 ExportSeqPort (0, bsp->length, spp, fp);
5199
5200 SeqPortFree (spp);
5201 fprintf (fp, "\n");
5202 }
5203
5204 static void ExportOneSegmentedBioseq (BioseqPtr bsp, FILE *fp)
5205 {
5206 SeqLocPtr slp;
5207 BioseqPtr bsp_seg;
5208 SeqDescrPtr sdp;
5209 CharPtr title = NULL;
5210
5211 if (bsp == NULL || fp == NULL || bsp->repr != Seq_repr_seg)
5212 {
5213 return;
5214 }
5215
5216 fprintf (fp, "[\n");
5217
5218 sdp = bsp->descr;
5219 while (sdp != NULL && sdp->choice != Seq_descr_title)
5220 {
5221 sdp = sdp->next;
5222 }
5223 if (sdp != NULL)
5224 {
5225 title = sdp->data.ptrvalue;
5226 }
5227
5228 slp = (SeqLocPtr) bsp->seq_ext;
5229 while (slp != NULL)
5230 {
5231 bsp_seg = BioseqFind (SeqLocId (slp));
5232 ExportOneRawSequence (bsp_seg, title, fp);
5233 title = NULL;
5234 slp = slp->next;
5235 }
5236 fprintf (fp, "]\n\n");
5237 }
5238
5239 static Boolean ExportOneDeltaBioseq (BioseqPtr bsp, FILE *fp)
5240 {
5241 SeqDescrPtr sdp;
5242 CharPtr title = NULL;
5243 DeltaSeqPtr dsp;
5244 SeqLitPtr slip;
5245 SeqPortPtr spp;
5246 Char buffer [EXPORT_PAGE_WIDTH + 1];
5247 Int4 seq_offset;
5248
5249 if (bsp == NULL || fp == NULL || bsp->repr != Seq_repr_delta
5250 || bsp->seq_ext_type != 4 || bsp->seq_ext == NULL)
5251 {
5252 return FALSE;
5253 }
5254
5255 dsp = (DeltaSeqPtr) bsp->seq_ext;
5256 while (dsp != NULL)
5257 {
5258 if (dsp->data.ptrvalue == NULL || dsp->choice != 2)
5259 {
5260 Message (MSG_ERROR, "Can't export badly formed delta sequence!");
5261 return FALSE;
5262 }
5263 dsp = dsp->next;
5264 }
5265
5266 sdp = bsp->descr;
5267 while (sdp != NULL && sdp->choice != Seq_descr_title)
5268 {
5269 sdp = sdp->next;
5270 }
5271 if (sdp != NULL)
5272 {
5273 title = sdp->data.ptrvalue;
5274 }
5275
5276 ExportSeqIdAndTitle (bsp->id, title, fp);
5277
5278 buffer [EXPORT_PAGE_WIDTH] = 0;
5279
5280 spp = SeqPortNew (bsp, 0, bsp->length-1, Seq_strand_plus, Seq_code_iupacna);
5281
5282 seq_offset = 0;
5283 dsp = (DeltaSeqPtr) bsp->seq_ext;
5284 while (dsp != NULL)
5285 {
5286 slip = (SeqLitPtr) (dsp->data.ptrvalue);
5287 if (IsDeltaSeqGap(dsp))
5288 {
5289 if (IsDeltaSeqUnknownGap (dsp))
5290 {
5291 fprintf (fp, ">?unk100\n");
5292 }
5293 else
5294 {
5295 fprintf (fp, ">?%d\n", slip->length);
5296 }
5297 }
5298 else
5299 {
5300 ExportSeqPort (seq_offset, seq_offset + slip->length, spp, fp);
5301 }
5302 seq_offset += slip->length;
5303 dsp = dsp->next;
5304 }
5305 fprintf (fp, "\n");
5306 return TRUE;
5307 }
5308
5309 static void ExportFASTASeqEntryList (SeqEntryPtr sep, FILE *fp)
5310 {
5311 BioseqPtr bsp;
5312 BioseqSetPtr bssp;
5313
5314 if (sep == NULL || sep->data.ptrvalue == NULL || fp == NULL)
5315 {
5316 return;
5317 }
5318
5319 if (IS_Bioseq (sep))
5320 {
5321 bsp = (BioseqPtr) sep->data.ptrvalue;
5322 if (ISA_na (bsp->mol))
5323 {
5324 if (bsp->repr == Seq_repr_raw)
5325 {
5326 if (SeqMgrGetParentOfPart (bsp, NULL) == NULL)
5327 {
5328 ExportOneRawSequence (bsp, NULL, fp);
5329 }
5330 }
5331 else if (bsp->repr == Seq_repr_seg)
5332 {
5333 ExportOneSegmentedBioseq (bsp, fp);
5334 }
5335 else if (bsp->repr == Seq_repr_delta)
5336 {
5337 ExportOneDeltaBioseq (bsp, fp);
5338 }
5339 }
5340 }
5341 else if (IS_Bioseq_set (sep))
5342 {
5343 bssp = (BioseqSetPtr) sep->data.ptrvalue;
5344 /* we don't export the parts set because we export them
5345 * when we do the master segment
5346 */
5347 if (bssp->_class != BioseqseqSet_class_parts)
5348 {
5349 ExportFASTASeqEntryList (bssp->seq_set, fp);
5350 }
5351 }
5352 ExportFASTASeqEntryList (sep->next, fp);
5353 }
5354
5355 static Boolean ExportNucleotideFASTADialog (DialoG d, CharPtr filename)
5356 {
5357 CharPtr extension;
5358 FILE *f;
5359 FastaPagePtr fpp;
5360 Char path [PATH_MAX];
5361 Boolean rval = FALSE;
5362
5363 fpp = (FastaPagePtr) GetObjectExtra (d);
5364 if (fpp == NULL) {
5365 return FALSE;
5366 }
5367
5368 path [0] = '\0';
5369 StringNCpy_0 (path, filename, sizeof (path));
5370
5371 extension = NULL;
5372 if (fpp->is_mrna) {
5373 extension = GetAppProperty ("FastaNucExtension");
5374 } else if (fpp->is_na) {
5375 extension = GetAppProperty ("FastaNucExtension");
5376 } else {
5377 extension = GetAppProperty ("FastaProtExtension");
5378 }
5379 if (path [0] != '\0' || GetOutputFileName (path, sizeof (path), extension)) {
5380 f = FileOpen (path, "w");
5381 if (f == NULL)
5382 {
5383 Message (MSG_ERROR, "Unable to open %s", path);
5384 }
5385 else
5386 {
5387 WatchCursor ();
5388 ExportFASTASeqEntryList (fpp->list, f);
5389 FileClose (f);
5390
5391 ArrowCursor ();
5392 Update ();
5393 rval = TRUE;
5394 }
5395 }
5396 return rval;
5397 }
5398
5399 static void CleanupFastaDialog (GraphiC g, VoidPtr data)
5400
5401 {
5402 FastaPagePtr fpp;
5403
5404 fpp = (FastaPagePtr) data;
5405 if (fpp != NULL) {
5406 ResetFastaPage (fpp);
5407 }
5408 MemFree (data);
5409 }
5410
5411 static CharPtr fastaNucMsg = "\
5412 \nClick on 'Import Nucleotide FASTA' to read a formatted FASTA file \
5413 or 'Add/Modify Sequences' to create the file here. The FASTA definition \
5414 line must be in the following form:\n\n\
5415 >SeqID [organism=scientific name]\n\n\
5416 where the [ and ] brackets are actually in the text.\n\
5417 Properly formatted modifiers and a title can also be included in the FASTA definition line.";
5418
5419
5420 static CharPtr fastaGenMsg = "\
5421 \nPlease enter information about the genomic \
5422 sequence in the spaces above. Then click on either \
5423 'Add/Modify Sequences' to create your sequences with the editor or \
5424 'Import Genomic FASTA' to read a previously generated FASTA file that \
5425 contains the sequence (which can be in segments). The \
5426 FASTA definition lines may be of the following form:\n\n\
5427 >ID [organism=scientific name] [strain=name] [clone=name] title\n\n\
5428 where the [ and ] brackets are actually in the text.";
5429
5430 static CharPtr fastaMrnaMsg = "\
5431 \nPlease enter information about the transcript \
5432 sequences in the spaces above. Then click on \
5433 'Import Transcript FASTA' to read a FASTA file that \
5434 contains the sequence (which can be in segments). The \
5435 FASTA definition lines may be of the following form:\n\n\
5436 >ID [gene=symbol] [mrna=name] title\n\n\
5437 where the [ and ] brackets are actually in the text.";
5438
5439 static CharPtr fastaProtMsg = "\
5440 \nPlease enter information about the protein \
5441 sequences in the spaces above. Then click on \
5442 'Import Protein FASTA' to read a FASTA file that \
5443 contains the sequences. The FASTA definition lines should \
5444 be of the following form:\n\n\
5445 >ID [gene=symbol] [protein=name] title\n\n\
5446 where the [ and ] brackets are actually in the text.";
5447
5448 static CharPtr GetFastaSettingName (FastaPagePtr fpp)
5449 {
5450 if (fpp == NULL)
5451 {
5452 return NULL;
5453 }
5454 else if (fpp->is_mrna)
5455 {
5456 return "PARSEMRNASEQID";
5457 }
5458 else if (fpp->is_na)
5459 {
5460 return "PARSENUCSEQID";
5461 }
5462 else
5463 {
5464 return "PARSEPROTSEQID";
5465 }
5466 }
5467
5468 static void ChangeIDParse (ButtoN b)
5469 {
5470 FastaPagePtr fpp;
5471 CharPtr setting_name;
5472
5473 fpp = (FastaPagePtr) GetObjectExtra (b);
5474 if (fpp != NULL) {
5475 fpp->parseSeqId = GetStatus (b);
5476
5477 setting_name = GetFastaSettingName (fpp);
5478
5479 if (fpp->parseSeqId) {
5480 SetAppParam ("SEQUINCUSTOM", "PREFERENCES", setting_name, "TRUE");
5481 SafeHide (fpp->singleIdGrp);
5482 } else {
5483 SetAppParam ("SEQUINCUSTOM", "PREFERENCES", setting_name, "FALSE");
5484 if (fpp->single)
5485 {
5486 SafeShow (fpp->singleIdGrp);
5487 }
5488 else
5489 {
5490 SafeHide (fpp->singleIdGrp);
5491 }
5492 }
5493 }
5494 }
5495
5496 extern DialoG CreateFastaDialog (GrouP h, CharPtr title,
5497 Boolean is_na, Boolean is_mrna, CharPtr text,
5498 Boolean single, Int2Ptr seqPackagePtr)
5499
5500 {
5501 FastaPagePtr fpp;
5502 GrouP g;
5503 GrouP m;
5504 GrouP p;
5505 GrouP s;
5506 PrompT pr;
5507 CharPtr setting_name;
5508 ButtoN prs = NULL;
5509 Char str [32];
5510 Boolean parseSeqId;
5511 #ifdef WIN_MAC
5512 Int2 wid = 25;
5513 #else
5514 Int2 wid = 33;
5515 #endif
5516
5517 p = HiddenGroup (h, 1, 0, NULL);
5518 SetGroupSpacing (p, 10, 10);
5519
5520 fpp = (FastaPagePtr) MemNew (sizeof (FastaPage));
5521 if (fpp != NULL) {
5522
5523 SetObjectExtra (p, fpp, CleanupFastaDialog);
5524 fpp->dialog = (DialoG) p;
5525 fpp->todialog = NULL;
5526 fpp->fromdialog = NULL;
5527 fpp->importdialog = ImportFastaDialog;
5528 if (is_na)
5529 {
5530 fpp->exportdialog = ExportNucleotideFASTADialog;
5531 }
5532 else
5533 {
5534 fpp->exportdialog = NULL;
5535 }
5536
5537 fpp->seqPackagePtr = seqPackagePtr;
5538 if (title != NULL && title [0] != '\0') {
5539 s = NormalGroup (p, 0, -2, title, systemFont, NULL);
5540 } else {
5541 s = HiddenGroup (p, 0, -2, NULL);
5542 }
5543 m = HiddenGroup (s, -1, 0, NULL);
5544
5545 fpp->path [0] = '\0';
5546 fpp->is_na = is_na;
5547 fpp->is_mrna = is_mrna;
5548 fpp->single = single;
5549
5550 setting_name = GetFastaSettingName (fpp);
5551
5552 if (GetAppParam ("SEQUINCUSTOM", "SETTINGS", "ALLOWNOSEQID", NULL, str, sizeof (str))
5553 && StringICmp (str, "TRUE") == 0)
5554 {
5555 prs = CheckBox (m, "Fasta definition line starts with sequence ID", ChangeIDParse);
5556 SetObjectExtra (prs, fpp, NULL);
5557 }
5558 parseSeqId = FALSE;
5559 if (GetAppParam ("SEQUINCUSTOM", "PREFERENCES", setting_name, NULL, str, sizeof (str))) {
5560 if (StringICmp (str, "TRUE") == 0) {
5561 parseSeqId = TRUE;
5562 }
5563 }
5564 else
5565 {
5566 parseSeqId = TRUE;
5567 }
5568 SetStatus (prs, parseSeqId);
5569
5570 fpp->parseSeqId = parseSeqId;
5571 if (fpp->single) {
5572 fpp->singleIdGrp = HiddenGroup (m, 2, 0, NULL);
5573 StaticPrompt (fpp->singleIdGrp, "Enter unique identifier for this sequence", 0, dialogTextHeight, programFont, 'l');
5574 fpp->singleSeqID = DialogText (fpp->singleIdGrp, "", 6, NULL);
5575 if (parseSeqId) {
5576 Hide (fpp->singleIdGrp);
5577 }
5578 }
5579
5580 g = HiddenGroup (m, 0, 0, NULL);
5581 fpp->instructions = MultiLinePrompt (g, text, 27 * stdCharWidth, programFont);
5582 fpp->have_seq_instr_grp = HiddenGroup (g, -1, 0, NULL);
5583 SetGroupSpacing (fpp->have_seq_instr_grp, 10, 10);
5584 fpp->doc = DocumentPanel (fpp->have_seq_instr_grp, stdCharWidth * wid, stdLineHeight * 12);
5585 SetDocAutoAdjust (fpp->doc, FALSE);
5586 pr = StaticPrompt (fpp->have_seq_instr_grp, "Choose Clear from the Edit menu to clear these sequences", 0, dialogTextHeight, systemFont, 'c');
5587 AlignObjects (ALIGN_CENTER, (HANDLE) fpp->doc, (HANDLE) pr, NULL);
5588 Hide (fpp->have_seq_instr_grp);
5589 AlignObjects (ALIGN_CENTER, (HANDLE) fpp->instructions,
5590 (HANDLE) fpp->have_seq_instr_grp, NULL);
5591
5592 AlignObjects (ALIGN_CENTER, (HANDLE) g,
5593 (HANDLE) prs,
5594 (HANDLE) fpp->singleIdGrp,
5595 NULL);
5596 }
5597
5598 return (DialoG) p;
5599 }
5600
5601 typedef struct phylippage {
5602 DIALOG_MESSAGE_BLOCK
5603 Uint1 format;
5604 Char path [PATH_MAX];
5605 SeqEntryPtr sep;
5606 ValNodePtr errmsgs;
5607 DoC doc;
5608 GrouP instructions;
5609 Char extension [10];
5610 Int4 type;
5611 TSequenceInfoPtr aln_settings;
5612
5613 } PhylipPage, PNTR PhylipPagePtr;
5614
5615
5616 #define PhylipFormatBufLen 1000
5617
5618 static void FormatPhylipDoc (PhylipPagePtr ppp)
5619
5620 {
5621 Nlm_QualNameAssocPtr ap;
5622 BioseqPtr bsp;
5623 BioseqSetPtr bssp;
5624 CharPtr label;
5625 Int4 len;
5626 CharPtr measure;
5627 SeqEntryPtr nsep;
5628 Int2 num;
5629 CharPtr plural;
5630 SeqIdPtr sip;
5631 SeqEntryPtr sep;
5632 CharPtr str;
5633 CharPtr title;
5634 CharPtr ttl;
5635 CharPtr tmp;
5636 CharPtr valstr;
5637 ValNodePtr vnp;
5638
5639 if (ppp != NULL) {
5640 str = MemNew (sizeof (char) * PhylipFormatBufLen);
5641 tmp = MemNew (sizeof (char) * PhylipFormatBufLen);
5642 if (str == NULL || tmp == NULL) return;
5643 num = 0;
5644 len = 0;
5645 sep = ppp->sep;
5646 if (sep != NULL && IS_Bioseq_set (sep)) {
5647 bssp = (BioseqSetPtr) sep->data.ptrvalue;
5648 if (bssp != NULL && (bssp->_class == 7 ||
5649 (IsPopPhyEtcSet (bssp->_class)))) {
5650 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
5651 num++;
5652 if (IS_Bioseq (sep)) {
5653 bsp = (BioseqPtr) sep->data.ptrvalue;
5654 if (bsp != NULL) {
5655 len += bsp->length;
5656 }
5657 } else if (IS_Bioseq_set (sep)) {
5658 nsep = FindNucSeqEntry (sep);
5659 if (nsep != NULL && IS_Bioseq (nsep)) {
5660 bsp = (BioseqPtr) nsep->data.ptrvalue;
5661 if (bsp != NULL) {
5662 len += bsp->length;
5663 }
5664 }
5665 }
5666 }
5667 }
5668 }
5669 if (num > 1) {
5670 plural = "s";
5671 } else {
5672 plural = "";
5673 }
5674 label = "Sequence";
5675 measure = "nucleotides";
5676 sprintf (str, "%d nucleotide sequence%s, total length %ld %s\n",
5677 (int) num, plural, (long) len, measure);
5678 AppendText (ppp->doc, str, &faParFmt, &faColFmt, programFont);
5679 vnp = ppp->errmsgs;
5680 num = 0;
5681 sep = ppp->sep;
5682 if (sep != NULL && IS_Bioseq_set (sep)) {
5683 bssp = (BioseqSetPtr) sep->data.ptrvalue;
5684 if (bssp != NULL && (bssp->_class == 7 ||
5685 (IsPopPhyEtcSet (bssp->_class)))) {
5686 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
5687 nsep = NULL;
5688 num++;
5689 len = 0;
5690 sip = NULL;
5691 tmp [0] = '\0';
5692 if (IS_Bioseq (sep)) {
5693 bsp = (BioseqPtr) sep->data.ptrvalue;
5694 if (bsp != NULL) {
5695 len = bsp->length;
5696 sip = SeqIdFindWorst (bsp->id);
5697 SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
5698 }
5699 } else if (IS_Bioseq_set (sep)) {
5700 nsep = FindNucSeqEntry (sep);
5701 if (nsep != NULL && IS_Bioseq (nsep)) {
5702 bsp = (BioseqPtr) nsep->data.ptrvalue;
5703 if (bsp != NULL) {
5704 len = bsp->length;
5705 sip = SeqIdFindWorst (bsp->id);
5706 SeqIdWrite (sip, tmp, PRINTID_REPORT, FastaFormatBufLen);
5707 }
5708 }
5709 }
5710 sprintf (str, "\n%s %d\nLength: %ld %s\nSequence ID: %s\n", label,
5711 (int) num, (long) len, measure, tmp);
5712 ttl = NULL;
5713 SeqEntryExplore (nsep, (Pointer) (&ttl), FindFirstTitle);
5714 title = StringSaveNoNull (ttl);
5715 if (title != NULL) {
5716 valstr = FindValueFromPairInDefline ("organism", title);
5717 if (!StringHasNoText (valstr)) {
5718 AddReportLine (str, "Organism", valstr);
5719 }
5720 valstr = MemFree (valstr);
5721 RemoveValueFromDefline ("organism", title);
5722
5723 valstr = FindValueFromPairInDefline ("lineage", title);
5724 if (!StringHasNoText (valstr)) {
5725 AddReportLine (str, "Lineage", valstr);
5726 }
5727 valstr = MemFree (valstr);
5728 RemoveValueFromDefline ("lineage", title);
5729
5730 for (ap = current_orgmod_subtype_alist; ap->name != NULL; ap++) {
5731 if (IsNonTextModifier (ap->name))
5732 {
5733 if (FindValuePairInDefLine (ap->name, title, NULL) != NULL)
5734 {
5735 AddReportLine (str, ap->name, "TRUE");
5736 RemoveValueFromDefline (ap->name, title);
5737 }
5738 }
5739 else
5740 {
5741 valstr = FindValueFromPairInDefline (ap->name, title);
5742 if (!StringHasNoText (valstr)) {
5743 AddReportLine (str, ap->name, title);
5744 }
5745 valstr = MemFree (valstr);
5746 RemoveValueFromDefline (ap->name, title);
5747 }
5748 }
5749 for (ap = current_subsource_subtype_alist; ap->name != NULL; ap++) {
5750 if (IsNonTextModifier (ap->name))
5751 {
5752 if (FindValuePairInDefLine (ap->name, title, NULL) != NULL)
5753 {
5754 AddReportLine (str, ap->name, "TRUE");
5755 RemoveValueFromDefline (ap->name, title);
5756 }
5757 }
5758 else
5759 {
5760 valstr = FindValueFromPairInDefline (ap->name, title);
5761 if (!StringHasNoText (valstr)) {
5762 AddReportLine (str, ap->name, title);
5763 }
5764 valstr = MemFree (valstr);
5765 RemoveValueFromDefline (ap->name, title);
5766 }
5767 }
5768
5769 valstr = FindValueFromPairInDefline ("note-orgmod", title);
5770 if (!StringHasNoText (valstr)) {
5771 AddReportLine (str, "Note", valstr);
5772 }
5773 valstr = MemFree (valstr);
5774 RemoveValueFromDefline ("note-orgmod", title);
5775
5776 valstr = FindValueFromPairInDefline ("note-subsrc", title);
5777 if (!StringHasNoText (valstr)) {
5778 AddReportLine (str, "Note", valstr);
5779 }
5780 valstr = MemFree (valstr);
5781 RemoveValueFromDefline ("note-subsrc", title);
5782
5783 valstr = FindValueFromPairInDefline ("molecule", title);
5784 if (!StringHasNoText (valstr)) {
5785 AddReportLine (str, "Molecule", valstr);
5786 }
5787 valstr = MemFree (valstr);
5788 RemoveValueFromDefline ("molecule", title);
5789
5790 valstr = FindValueFromPairInDefline ("moltype", title);
5791 if (!StringHasNoText (valstr)) {
5792 AddReportLine (str, "MolType", valstr);
5793 }
5794 valstr = MemFree (valstr);
5795 RemoveValueFromDefline ("moltype", title);
5796
5797 valstr = FindValueFromPairInDefline ("location", title);
5798 if (!StringHasNoText (valstr)) {
5799 AddReportLine (str, "Location", valstr);
5800 }
5801 valstr = MemFree (valstr);
5802 RemoveValueFromDefline ("location", valstr);
5803
5804 TrimSpacesAroundString (title);
5805 if (! StringHasNoText (title)) {
5806 StringCat (str, "Title: ");
5807 StringNCat (str, title, 128);
5808 StringCat (str, "\n");
5809 } else {
5810 StringCat (str, "No title detected\n");
5811 }
5812 }
5813 MemFree (title);
5814 if (vnp != NULL && vnp->data.ptrvalue != NULL) {
5815 StringCat (str, (CharPtr) vnp->data.ptrvalue);
5816 StringCat (str, "\n");
5817 }
5818 AppendText (ppp->doc, str, &faParFmt, &faColFmt, programFont);
5819 if (vnp != NULL) {
5820 vnp = vnp->next;
5821 }
5822 }
5823 }
5824 }
5825 MemFree (str);
5826 MemFree (tmp);
5827 UpdateDocument (ppp->doc, 0, 0);
5828 }
5829 }
5830
5831 static void ResetPhylipPage (PhylipPagePtr ppp)
5832
5833 {
5834 if (ppp != NULL) {
5835 ppp->sep = SeqEntryFree (ppp->sep);
5836 ppp->errmsgs = ValNodeFreeData (ppp->errmsgs);
5837 }
5838 }
5839
5840 static CharPtr noOrgInTitleWarning =
5841 "sequences have organism information in titles. " \
5842 "It is critical to annotate the data file with organism and source information. " \
5843 "Please quit Sequin and read the Sequin Quick Guide section on preparing the data files before proceeding.";
5844
5845 static void CountTitlesWithoutOrganisms (SeqEntryPtr sep)
5846 {
5847 IDAndTitleEditPtr iatep;
5848 Int4 seq_num;
5849 CharPtr org_name;
5850 Int4 num_sequences = 0, num_with_orgs = 0;
5851
5852 iatep = SeqEntryListToIDAndTitleEdit (sep);
5853 if (iatep == NULL)
5854 {
5855 return;
5856 }
5857
5858 for (seq_num = 0; seq_num < iatep->num_sequences; seq_num++)
5859 {
5860 if (iatep->is_seg != NULL && iatep->is_seg [seq_num])
5861 {
5862 continue;
5863 }
5864 num_sequences ++;
5865 org_name = FindValueFromPairInDefline ("organism", iatep->title_list [seq_num]);
5866 if (!StringHasNoText (org_name))
5867 {
5868 num_with_orgs ++;
5869 }
5870 org_name = MemFree (org_name);
5871 }
5872 iatep = IDAndTitleEditFree (iatep);
5873 if (num_sequences != num_with_orgs && num_with_orgs != 0)
5874 {
5875 Message (MSG_OK, "%d of %d %s", num_sequences - num_with_orgs, (int) num_sequences, noOrgInTitleWarning);
5876 }
5877
5878 }
5879
5880 static CharPtr phylipNucMsg = "\
5881 \nClick 'Import Nucleotide Alignment' to load your \
5882 nucleotide alignment file.\n\nClick on 'Custom Alignment Settings' \
5883 if Sequin has trouble reading your alignment file.";
5884
5885 static void SetPhylipDocInstructions (PhylipPagePtr ppp)
5886 {
5887 if (ppp == NULL || ppp->doc == NULL) return;
5888 Reset (ppp->doc);
5889 AppendText (ppp->doc, phylipNucMsg, &faParFmt, &faColFmt, programFont);
5890 UpdateDocument (ppp->doc, 0, 0);
5891 Update ();
5892 }
5893
5894 static Boolean ImportPhylipDialog (DialoG d, CharPtr filename)
5895 {
5896 Char path [PATH_MAX];
5897 PhylipPagePtr ppp;
5898 SeqEntryPtr sep;
5899 RecT r;
5900 FILE *fp;
5901 ObjMgrDataPtr omdptop;
5902 ObjMgrData omdata;
5903 Uint2 parenttype;
5904 Pointer parentptr;
5905 Char errStr [PATH_MAX + 64];
5906 CharPtr no_org_err_msg = NULL;
5907
5908 if (d == NULL || filename == NULL) return FALSE;
5909
5910 path [0] = '\0';
5911 StringNCpy_0 (path, filename, sizeof (path));
5912 ppp = (PhylipPagePtr) GetObjectExtra (d);
5913 if (ppp == NULL) {
5914 return FALSE;
5915 }
5916
5917 if (path [0] != '\0' || GetInputFileName (path, sizeof (path), ppp->extension, "TEXT")) {
5918 WatchCursor ();
5919 StringCpy (ppp->path, path);
5920 ObjectRect (ppp->doc, &r);
5921 InsetRect (&r, 4, 4);
5922 faColFmt.pixWidth = r.right - r.left;
5923 Reset (ppp->doc);
5924 Update ();
5925 ppp->sep = SeqEntryFree (ppp->sep);
5926 fp = FileOpen (path, "r");
5927 if (fp != NULL) {
5928 ppp->sep = SeqEntryFromAlignmentFile (fp, ppp->aln_settings,
5929 Seq_mol_na, no_org_err_msg);
5930
5931 /* check for bracketing issues here */
5932 if (CollectIDsAndTitles (ppp->sep, NULL, TRUE))
5933 {
5934 /* add default molecule type, topology, location, and genetic codes */
5935 AddDefaultModifierValues (ppp->sep);
5936 }
5937 else
5938 {
5939 ppp->sep = SeqEntryFree (ppp->sep);
5940 }
5941
5942 sep = ppp->sep;
5943 if (sep != NULL) {
5944 SaveSeqEntryObjMgrData (ppp->sep, &omdptop, &omdata);
5945 GetSeqEntryParent (ppp->sep, &parentptr, &parenttype);
5946 SeqMgrLinkSeqEntry (sep, parenttype, parentptr);
5947 RestoreSeqEntryObjMgrData (sep, omdptop, &omdata);
5948
5949 FormatPhylipDoc (ppp);
5950 SafeShow (ppp->doc);
5951
5952 CountTitlesWithoutOrganisms (sep);
5953 } else {
5954 SendHelpScrollMessage (helpForm, "Nucleotide Page", "Nucleotide Page for Aligned Data Formats");
5955 SetPhylipDocInstructions (ppp);
5956 }
5957 } else {
5958 SetPhylipDocInstructions (ppp);
5959 }
5960 } else {
5961 sprintf (errStr, "ERROR: Unable to open file %s\n\n", path);
5962 AppendText (ppp->doc, errStr, &faParFmt, &faColFmt, programFont);
5963 AppendText (ppp->doc, strerror(errno), &faParFmt, &faColFmt, programFont);
5964 SafeShow (ppp->doc);
5965 Update ();
5966 }
5967 ArrowCursor ();
5968 Update ();
5969 return TRUE;
5970 }
5971
5972 static void CleanupPhylipDialog (GraphiC g, VoidPtr data)
5973
5974 {
5975 PhylipPagePtr ppp;
5976
5977 ppp = (PhylipPagePtr) data;
5978 if (ppp != NULL) {
5979 ResetPhylipPage (ppp);
5980 SequenceInfoFree (ppp->aln_settings);
5981 ppp->aln_settings = NULL;
5982 }
5983 MemFree (data);
5984 }
5985
5986
5987 static DialoG CreatePhylipDialog (GrouP h, CharPtr title, CharPtr text,
5988 Int2 format, CharPtr extension,
5989 Int4 type)
5990
5991 {
5992 PhylipPagePtr ppp;
5993 GrouP g;
5994 GrouP m;
5995 GrouP p;
5996 GrouP s;
5997 RecT r;
5998
5999 p = HiddenGroup (h, 1, 0, NULL);
6000 SetGroupSpacing (p, 10, 10);
6001
6002 ppp = (PhylipPagePtr) MemNew (sizeof (PhylipPage));
6003 if (ppp != NULL) {
6004
6005 SetObjectExtra (p, ppp, CleanupPhylipDialog);
6006 ppp->dialog = (DialoG) p;
6007 ppp->todialog = NULL;
6008 ppp->fromdialog = NULL;
6009 ppp->importdialog = ImportPhylipDialog;
6010 ppp->type = type;
6011
6012 if (title != NULL && title [0] != '\0') {
6013 s = NormalGroup (p, 0, -2, title, systemFont, NULL);
6014 } else {
6015 s = HiddenGroup (p, 0, -2, NULL);
6016 }
6017 m = HiddenGroup (s, -1, 0, NULL);
6018
6019 ppp->format = format;
6020 ppp->path [0] = '\0';
6021 StringNCpy_0 (ppp->extension, extension, sizeof (ppp->extension));
6022
6023 g = HiddenGroup (m, 0, 0, NULL);
6024 ppp->doc = DocumentPanel (g, stdCharWidth * 27, stdLineHeight * 8);
6025 ObjectRect (ppp->doc, &r);
6026 InsetRect (&r, 4, 4);
6027 faColFmt.pixWidth = r.right - r.left;
6028
6029 ppp->aln_settings = GetDefaultSequenceInfo();
6030
6031 SetPhylipDocInstructions (ppp);
6032 }
6033
6034 return (DialoG) p;
6035 }
6036
6037 #define NUCLEOTIDE_PAGE 0
6038 #define ORGANISM_PAGE 1
6039 #define MRNA_PAGE 2
6040 #define PROTEIN_PAGE 3
6041 #define ANNOTATE_PAGE 4
6042
6043 /*---------------------------------------------------------------------*/
6044 /* */
6045 /* HasZeroLengthSequence () -- Checks to see if any of a submission's */
6046 /* sequences are missing (ie -- zero */
6047 /* length). */
6048 /* */
6049 /*---------------------------------------------------------------------*/
6050
6051 extern Boolean HasZeroLengthSequence (ForM newForm)
6052 {
6053 SequencesFormPtr sqfp;
6054 FastaPagePtr fpp;
6055 SeqEntryPtr sep;
6056 BioseqPtr bsp;
6057
6058 /* Get the list of Bioseqs to check */
6059
6060 sqfp = (SequencesFormPtr) GetObjectExtra (newForm);
6061 if (NULL == sqfp)
6062 return TRUE;
6063
6064 fpp = GetObjectExtra (sqfp->dnaseq);
6065 sep = fpp->list;
6066
6067 /* Check the list */
6068
6069 while (NULL != sep) {
6070 if (sep->choice == 1) {
6071 bsp = (BioseqPtr) sep->data.ptrvalue;
6072 if (bsp->length <= 0)
6073 return TRUE;
6074 }
6075 sep = sep->next;
6076 }
6077
6078 /* If we made it to here, then */
6079 /* there were none found. */
6080
6081 return FALSE;
6082 }
6083
6084 extern Boolean SequencesFormHasProteins (ForM f)
6085
6086 {
6087 FastaPagePtr fpp;
6088 SequencesFormPtr sqfp;
6089
6090 sqfp = (SequencesFormPtr) GetObjectExtra (f);
6091 if (sqfp != NULL) {
6092 if (PackageTypeIsSet (sqfp->seqPackage)) return TRUE;
6093 fpp = GetObjectExtra (sqfp->protseq);
6094 if (fpp != NULL) {
6095 if (fpp->path [0] != '\0') {
6096 return TRUE;
6097 }
6098 }
6099 }
6100 return FALSE;
6101 }
6102
6103 extern SeqEntryPtr GetSequencesFormProteinList (ForM f)
6104
6105 {
6106 FastaPagePtr fpp;
6107 SequencesFormPtr sqfp;
6108
6109 sqfp = (SequencesFormPtr) GetObjectExtra (f);
6110 if (sqfp != NULL) {
6111 fpp = GetObjectExtra (sqfp->protseq);
6112 if (fpp != NULL) {
6113 return fpp->list;
6114 }
6115 }
6116 return NULL;
6117 }
6118
6119 static SeqEntryPtr GetSeqEntryFromSequencesForm (SequencesFormPtr sqfp)
6120 {
6121 SeqEntryPtr list = NULL;
6122 FastaPagePtr fpp;
6123 PhylipPagePtr ppp;
6124 SeqEntryPtr sep;
6125 BioseqSetPtr bssp;
6126
6127 if (sqfp == NULL) return NULL;
6128
6129 if (sqfp->seqPackage == SEQ_PKG_SEGMENTED)
6130 {
6131 fpp = (FastaPagePtr) GetObjectExtra (sqfp->dnaseq);
6132 if (fpp != NULL)
6133 {
6134 list = fpp->list;
6135 }
6136 }
6137 else if (sqfp->seqFormat == SEQ_FMT_FASTA) {
6138 fpp = (FastaPagePtr) GetObjectExtra (sqfp->dnaseq);
6139 if (fpp != NULL)
6140 {
6141 list = fpp->list;
6142 }
6143 } else if (sqfp->seqFormat == SEQ_FMT_ALIGNMENT) {
6144 ppp = (PhylipPagePtr) GetObjectExtra (sqfp->dnaseq);
6145 if (ppp != NULL) {
6146 sep = ppp->sep;
6147 if (sep != NULL && IS_Bioseq_set (sep)) {
6148 bssp = (BioseqSetPtr) sep->data.ptrvalue;
6149 if (bssp != NULL) {
6150 list = bssp->seq_set;
6151 }
6152 }
6153 }
6154 }
6155 return list;
6156 }
6157
6158 extern SeqEntryPtr GetSequencesFormNucleotideList (ForM f)
6159 {
6160 SequencesFormPtr sqfp;
6161
6162 sqfp = (SequencesFormPtr) GetObjectExtra (f);
6163 if (sqfp != NULL) {
6164 return GetSeqEntryFromSequencesForm (sqfp);
6165 }
6166 return NULL;
6167 }
6168
6169 extern Boolean SequencesFormHasTooManyNucleotides (ForM f)
6170
6171 {
6172 FastaPagePtr fpp;
6173 SequencesFormPtr sqfp;
6174
6175 sqfp = (SequencesFormPtr) GetObjectExtra (f);
6176 if (sqfp != NULL && PackageTypeIsSingle (sqfp->seqPackage))
6177 {
6178 fpp = GetObjectExtra (sqfp->dnaseq);
6179 if (fpp != NULL) {
6180 if (fpp->list != NULL && fpp->list->next != NULL) {
6181 return TRUE;
6182 }
6183 }
6184 }
6185 return FALSE;
6186 }
6187
6188 extern DialoG CreateTagListDialogEx (GrouP h, Uint2 rows, Uint2 cols,
6189 Int2 spacing, Uint2Ptr types,
6190 Uint2Ptr textWidths, EnumFieldAssocPtr PNTR alists,
6191 Boolean useBar, Boolean noExtend,
6192 ToDialogFunc tofunc, FromDialogFunc fromfunc);
6193
6194 static ValNodePtr
6195 BuildModifierTypeList
6196 (ValNodePtr type_list,
6197 CharPtr new_title,
6198 Boolean allow_prot)
6199 {
6200 ValNodePtr modifier_info_list;
6201 ValNodePtr info_vnp, type_vnp;
6202 ModifierInfoPtr mip;
6203
6204 modifier_info_list = ParseAllBracketedModifiers (new_title);
6205 for (info_vnp = modifier_info_list; info_vnp != NULL; info_vnp = info_vnp->next)
6206 {
6207 mip = (ModifierInfoPtr)info_vnp->data.ptrvalue;
6208 if (mip == NULL
6209 || mip->modtype == eModifierType_Protein
6210 || mip->modtype == eModifierType_Organism)
6211 {
6212 continue;
6213 }
6214 if (mip->modtype == eModifierType_SourceQual)
6215 {
6216 for (type_vnp = type_list;
6217 type_vnp != NULL
6218 && (type_vnp->choice != mip->subtype
6219 || StringICmp (type_vnp->data.ptrvalue, mip->name) != 0);
6220 type_vnp = type_vnp->next)
6221 {
6222 }
6223 }
6224 else
6225 {
6226 for (type_vnp = type_list;
6227 type_vnp != NULL && StringICmp (type_vnp->data.ptrvalue, mip->name) != 0;
6228 type_vnp = type_vnp->next)
6229 {
6230 }
6231 }
6232 if (type_vnp == NULL)
6233 {
6234 type_vnp = ValNodeNew (type_list);
6235 if (type_list == NULL) type_list = type_vnp;
6236 if (type_vnp != NULL)
6237 {
6238 type_vnp->choice = mip->subtype;
6239 type_vnp->data.ptrvalue = StringSave (mip->name);
6240 }
6241 }
6242 }
6243 ModifierInfoListFree (modifier_info_list);
6244 return type_list;
6245 }
6246
6247
6248 static Uint2 modedit_widths [] = {
6249 0, 0,
6250 };
6251
6252 ENUM_ALIST(nontextmodedit_alist)
6253 {"FALSE", 0},
6254 {"TRUE", 1},
6255 END_ENUM_ALIST
6256
6257 extern void ConfirmSequencesFormParsing (ForM f, FormActnFunc putItAllTogether)
6258
6259 {
6260 SequencesFormPtr sqfp;
6261
6262 sqfp = (SequencesFormPtr) GetObjectExtra (f);
6263 if (sqfp != NULL && putItAllTogether != NULL) {
6264 putItAllTogether (sqfp->form);
6265 }
6266 }
6267
6268 extern void AddToSubSource (BioSourcePtr biop, CharPtr title, CharPtr label, Uint1 subtype)
6269
6270 {
6271 CharPtr ptr;
6272 SubSourcePtr ssp;
6273 CharPtr str;
6274 SubSourcePtr tmpssp;
6275
6276 if (biop == NULL || title == NULL || label == NULL) return;
6277 str = MemNew (StringLen (title));
6278 if (str == NULL) return;
6279 ptr = StringISearch (title, label);
6280 if (ptr != NULL) {
6281 StringCpy (str, ptr + StringLen (label));
6282 ptr = StringChr (str, ']');
6283 if (ptr != NULL) {
6284 *ptr = '\0';
6285 TrimSpacesAroundString (str);
6286 ssp = SubSourceNew ();
6287 if (biop->subtype == NULL) {
6288 biop->subtype = ssp;
6289 } else {
6290 tmpssp = biop->subtype;
6291 while (tmpssp->next != NULL) {
6292 tmpssp = tmpssp->next;
6293 }
6294 tmpssp->next = ssp;
6295 }
6296 if (ssp != NULL) {
6297 ssp->subtype = subtype;
6298 ssp->name = StringSave (str);
6299 }
6300 }
6301 }
6302 MemFree (str);
6303 }
6304
6305 extern void AddToOrgMod (BioSourcePtr biop, CharPtr title, CharPtr label, Uint1 subtype)
6306
6307 {
6308 OrgModPtr mod;
6309 OrgNamePtr onp;
6310 OrgRefPtr orp;
6311 CharPtr ptr;
6312 CharPtr str;
6313 OrgModPtr tmpmod;
6314
6315 if (biop == NULL || title == NULL || label == NULL) return;
6316 str = MemNew (StringLen (title));
6317 if (str == NULL) return;
6318 ptr = StringISearch (title, label);
6319 if (ptr != NULL) {
6320 StringCpy (str, ptr + StringLen (label));
6321 ptr = StringChr (str, ']');
6322 if (ptr != NULL) {
6323 *ptr = '\0';
6324 TrimSpacesAroundString (str);
6325 orp = biop->org;
6326 if (orp == NULL) {
6327 orp = OrgRefNew ();
6328 biop->org = orp;
6329 }
6330 if (orp != NULL) {
6331 onp = orp->orgname;
6332 if (onp == NULL) {
6333 onp = OrgNameNew ();
6334 orp->orgname = onp;
6335 }
6336 if (onp != NULL) {
6337 mod = OrgModNew ();
6338 if (onp->mod == NULL) {
6339 onp->mod = mod;
6340 } else {
6341 tmpmod = onp->mod;
6342 while (tmpmod->next != NULL) {
6343 tmpmod = tmpmod->next;
6344 }
6345 tmpmod->next = mod;
6346 }
6347 if (mod != NULL) {
6348 mod->subtype = subtype;
6349 mod->subname = StringSave (str);
6350 }
6351 }
6352 }
6353 }
6354 }
6355 MemFree (str);
6356 }
6357
6358 #define PROC_NUC_STR_SIZE 4096
6359
6360 static Int4 TopologyFromString (CharPtr str)
6361 {
6362 EnumFieldAssocPtr eap;
6363
6364 for (eap = topology_nuc_alist; eap != NULL && eap->name != NULL; eap++)
6365 {
6366 if (StringICmp (eap->name, str) == 0)
6367 {
6368 return eap->value;
6369 }
6370 }
6371 return 1;
6372 }
6373
6374 static BioSourcePtr AddOrgRef (BioSourcePtr biop)
6375 {
6376 if (biop == NULL)
6377 {
6378 biop = BioSourceNew ();
6379 }
6380 if (biop == NULL)
6381 {
6382 return NULL;
6383 }
6384 if (biop->org == NULL)
6385 {
6386 biop->org = OrgRefNew ();
6387 }
6388 if (biop->org == NULL)
6389 {
6390 biop = BioSourceFree (biop);
6391 return NULL;
6392 }
6393 return biop;
6394 }
6395
6396 static BioSourcePtr AddOrgName (BioSourcePtr biop)
6397 {
6398 biop = AddOrgRef (biop);
6399 if (biop == NULL || biop->org == NULL)
6400 {
6401 biop = BioSourceFree (biop);
6402 return NULL;
6403 }
6404 if (biop->org->orgname == NULL)
6405 {
6406 biop->org->orgname = OrgNameNew ();
6407 if (biop->org->orgname == NULL)
6408 {
6409 biop = BioSourceFree (biop);
6410 return NULL;
6411 }
6412 }
6413 return biop;
6414 }
6415
6416 static BioSourcePtr SetGeneticCodeForBioSource (BioSourcePtr biop, Int4 gcode, Boolean is_nuc)
6417 {
6418 OrgRefPtr orp;
6419 OrgNamePtr onp;
6420
6421 if (gcode < 0)
6422 {
6423 return biop;
6424 }
6425
6426 biop = AddOrgName (biop);
6427 if (biop == NULL)
6428 {
6429 return biop;
6430 }
6431
6432 orp = biop->org;
6433 if (biop->org == NULL)
6434 {
6435 biop->org = OrgRefNew ();
6436 orp = biop->org;
6437 }
6438 if (orp != NULL) {
6439 onp = orp->orgname;
6440 if (onp == NULL) {
6441 onp = OrgNameNew ();
6442 orp->orgname = onp;
6443 }
6444 if (onp != NULL) {
6445 if (is_nuc)
6446 {
6447 onp->gcode = gcode;
6448 }
6449 else
6450 {
6451 onp->mgcode = gcode;
6452 }
6453 }
6454 }
6455 return biop;
6456 }
6457
6458 static BioSourcePtr
6459 SetGeneticCodeFromTitle
6460 (BioSourcePtr biop,
6461 CharPtr title,
6462 CharPtr mod_name,
6463 Boolean is_nuc)
6464 {
6465 CharPtr gcode_str;
6466 Int4 gcode;
6467 CharPtr next_org_loc;
6468
6469 if (StringHasNoText (title))
6470 {
6471 return biop;
6472 }
6473
6474 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6475 gcode_str = FindValueFromPairInDeflineBeforeCharPtr (mod_name, title, next_org_loc);
6476 if (!StringHasNoText (gcode_str))
6477 {
6478 gcode = GeneticCodeFromString (gcode_str);
6479 biop = SetGeneticCodeForBioSource (biop, gcode, is_nuc);
6480 }
6481 if (gcode_str != NULL)
6482 {
6483 RemoveValueFromDefline (mod_name, title);
6484 }
6485 gcode_str = MemFree (gcode_str);
6486 return biop;
6487 }
6488
6489 static BioSourcePtr
6490 SetAllGeneticCodesFromTitle
6491 (BioSourcePtr biop,
6492 CharPtr title)
6493 {
6494 Int4 code_to_use;
6495 CharPtr location;
6496 CharPtr next_org_loc;
6497
6498 if (StringHasNoText (title))
6499 {
6500 return biop;
6501 }
6502
6503 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6504 location = FindValueFromPairInDeflineBeforeCharPtr ("location", title, next_org_loc);
6505 if (!StringHasNoText (location))
6506 {
6507 code_to_use = UseGeneticCodeForLocation (location);
6508 if (code_to_use == USE_OTHER_GENETIC_CODE)
6509 {
6510 biop = SetGeneticCodeForBioSource (biop, 11, TRUE);
6511 RemoveValueFromDefline ("genetic_code", title);
6512 }
6513 else if (code_to_use == USE_NUCLEAR_GENETIC_CODE)
6514 {
6515 biop = SetGeneticCodeFromTitle (biop, title, "genetic_code", TRUE);
6516 }
6517 else if (code_to_use == USE_MITOCHONDRIAL_GENETIC_CODE)
6518 {
6519 biop = SetGeneticCodeFromTitle (biop, title, "genetic_code", FALSE);
6520 }
6521 }
6522 location = MemFree (location);
6523
6524 biop = SetGeneticCodeFromTitle (biop, title, "gcode", TRUE);
6525 biop = SetGeneticCodeFromTitle (biop, title, "mgcode", FALSE);
6526
6527 return biop;
6528 }
6529
6530 static void
6531 SetMoleculeAndMolTypeFromTitle
6532 (BioseqPtr bsp,
6533 CharPtr title,
6534 Int2 seqPackage)
6535 {
6536 SeqEntryPtr sep;
6537 ValNodePtr vnp;
6538 MolInfoPtr mip = NULL;
6539 Uint1 biomol;
6540 Int4 molecule;
6541 CharPtr valstr;
6542 CharPtr ptr;
6543 SeqLocPtr slp;
6544 BioseqPtr bsp_seg;
6545
6546 if (bsp == NULL)
6547 {
6548 return;
6549 }
6550
6551 sep = SeqMgrGetSeqEntryForData (bsp);
6552 if (sep == NULL)
6553 {
6554 return;
6555 }
6556
6557 vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
6558 if (vnp == NULL)
6559 {
6560 if (seqPackage == SEQ_PKG_SINGLE)
6561 {
6562 biomol = 3;
6563 molecule = Seq_mol_rna;
6564 }
6565 else
6566 {
6567 biomol = 1;
6568 molecule = Seq_mol_dna;
6569 }
6570 }
6571 else
6572 {
6573 mip = (MolInfoPtr) vnp->data.ptrvalue;
6574 biomol = mip->biomol;
6575 molecule = bsp->mol;
6576 }
6577
6578 /* get moltype from defline */
6579 valstr = FindValueFromPairInDefline ("moltype", title);
6580 if (!StringHasNoText (valstr))
6581 {
6582 biomol = MolTypeFromString (valstr);
6583 if (biomol == 1)
6584 {
6585 molecule = Seq_mol_na;
6586 }
6587 else if (biomol >= 2 && biomol <= 7)
6588 {
6589 molecule = Seq_mol_rna;
6590 }
6591 else if (biomol == 9)
6592 {
6593 molecule = Seq_mol_dna;
6594 }
6595 else if (biomol == 253)
6596 {
6597 molecule = Seq_mol_dna;
6598 biomol = 1;
6599 }
6600 else if (biomol == 254)
6601 {
6602 molecule = Seq_mol_rna;
6603 biomol = 1;
6604 }
6605 else if (biomol == 255)
6606 {
6607 molecule = Seq_mol_other;
6608 }
6609 }
6610 valstr = MemFree (valstr);
6611
6612 RemoveValueFromDefline ("moltype", title);
6613
6614 /* get molecule from defline */
6615 valstr = FindValueFromPairInDefline ("molecule", title);
6616 if (!StringHasNoText (valstr))
6617 {
6618 if (StringICmp (valstr, "dna") == 0) {
6619 molecule = Seq_mol_dna;
6620 } else if (StringICmp (valstr, "rna") == 0) {
6621 molecule = Seq_mol_rna;
6622 }
6623 }
6624 valstr = MemFree (valstr);
6625 RemoveValueFromDefline ("molecule", title);
6626
6627 ptr = StringISearch (title, "[dna]");
6628 if (ptr != NULL)
6629 {
6630 molecule = Seq_mol_dna;
6631 ExciseString (title, "[dna", "]");
6632 }
6633
6634 ptr = StringISearch (title, "[rna]");
6635 if (ptr != NULL)
6636 {
6637 molecule = Seq_mol_rna;
6638 ExciseString (title, "[rna", "]");
6639 }
6640
6641 if (mip == NULL)
6642 {
6643 vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
6644 mip = MolInfoNew ();
6645 vnp->data.ptrvalue = mip;
6646 }
6647
6648 mip->biomol = biomol;
6649 bsp->mol = molecule;
6650
6651 valstr = FindValueFromPairInDefline ("tech", title);
6652 if (!StringHasNoText (valstr))
6653 {
6654 ReadTechFromString (valstr, mip);
6655 }
6656 valstr = MemFree (valstr);
6657 RemoveValueFromDefline ("tech", title);
6658
6659 if (bsp->repr == Seq_repr_seg)
6660 {
6661 slp = (SeqLocPtr) bsp->seq_ext;
6662 while (slp != NULL)
6663 {
6664 bsp_seg = BioseqFind (SeqLocId (slp));
6665 sep = SeqMgrGetSeqEntryForData (bsp_seg);
6666 if (bsp_seg != NULL)
6667 {
6668 bsp_seg->mol = bsp->mol;
6669 }
6670 vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
6671 if (vnp == NULL)
6672 {
6673 vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
6674 }
6675 if (vnp != NULL)
6676 {
6677 vnp->data.ptrvalue = MolInfoFree (vnp->data.ptrvalue);
6678 vnp->data.ptrvalue = (MolInfoPtr) AsnIoMemCopy (mip, (AsnReadFunc) MolInfoAsnRead,
6679 (AsnWriteFunc) MolInfoAsnWrite);
6680 }
6681 slp = slp->next;
6682 }
6683 }
6684 }
6685
6686 static void AddGeneticCodeComment (BioseqPtr bsp, CharPtr comment)
6687 {
6688 SeqDescPtr sdp;
6689 UserObjectPtr uop = NULL;
6690 ObjectIdPtr oip;
6691 UserFieldPtr ufp, last_ufp = NULL;
6692 CharPtr comment_fmt = "Submitter genetic code: %s";
6693 CharPtr new_comment;
6694 Int4 new_comment_len;
6695
6696 if (bsp == NULL || StringHasNoText (comment))
6697 {
6698 return;
6699 }
6700
6701 sdp = bsp->descr;
6702 while (sdp != NULL && uop == NULL)
6703 {
6704 if (sdp->choice == Seq_descr_user && sdp->data.ptrvalue != NULL)
6705 {
6706 uop = (UserObjectPtr) sdp->data.ptrvalue;
6707 oip = uop->type;
6708 if (oip == NULL || StringCmp (oip->str, "Submission") != 0)
6709 {
6710 uop = NULL;
6711 }
6712 }
6713 sdp = sdp->next;
6714 }
6715
6716
6717 if (uop == NULL)
6718 {
6719 uop = UserObjectNew ();
6720 if (uop == NULL)
6721 {
6722 return;
6723 }
6724 uop->type = ObjectIdNew ();
6725 uop->type->str = StringSave ("Submission");
6726 ValNodeAddPointer (&bsp->descr, Seq_descr_user, uop);
6727 }
6728
6729 ufp = uop->data;
6730 while (ufp != NULL
6731 && (ufp->label == NULL
6732 || StringCmp (ufp->label->str, "AdditionalComment") != 0))
6733 {
6734 last_ufp = ufp;
6735 ufp = ufp->next;
6736 }
6737
6738 if (ufp == NULL)
6739 {
6740 ufp = UserFieldNew ();
6741 ufp->label = ObjectIdNew ();
6742 ufp->label->str = StringSave ("AdditionalComment");
6743 if (last_ufp == NULL)
6744 {
6745 uop->data = ufp;
6746 }
6747 else
6748 {
6749 last_ufp->next = ufp;
6750 }
6751 }
6752
6753 new_comment_len = StringLen (comment) + StringLen (comment_fmt);
6754 if (!StringHasNoText (ufp->data.ptrvalue))
6755 {
6756 new_comment_len += StringLen (ufp->data.ptrvalue);
6757 }
6758 new_comment = (CharPtr) MemNew (new_comment_len * sizeof (Char));
6759 sprintf (new_comment, comment_fmt, comment);
6760
6761 if (!StringHasNoText (ufp->data.ptrvalue))
6762 {
6763 StringCat (new_comment, ufp->data.ptrvalue);
6764 }
6765
6766 ufp->data.ptrvalue = MemFree (ufp->data.ptrvalue);
6767 ufp->data.ptrvalue = new_comment;
6768 }
6769
6770 static BioSourcePtr AddOrgModValue (BioSourcePtr biop, Uint1 subtype, CharPtr subname)
6771 {
6772 OrgModPtr mod;
6773
6774 if (subname == NULL)
6775 {
6776 return biop;
6777 }
6778
6779 biop = AddOrgName (biop);
6780 if (biop != NULL)
6781 {
6782 mod = OrgModNew ();
6783 if (mod != NULL)
6784 {
6785 mod->subtype = subtype;
6786 mod->subname = subname;
6787 subname = NULL;
6788 mod->next = biop->org->orgname->mod;
6789 biop->org->orgname->mod = mod;
6790 }
6791 }
6792 subname = MemFree (subname);
6793 return biop;
6794 }
6795
6796 static BioSourcePtr AddSubSourceValue (BioSourcePtr biop, Uint1 subtype, CharPtr subname)
6797 {
6798 SubSourcePtr ssp;
6799
6800 if (subname == NULL)
6801 {
6802 return biop;
6803 }
6804
6805 if (biop == NULL)
6806 {
6807 biop = BioSourceNew ();
6808 }
6809 if (biop != NULL)
6810 {
6811 ssp = SubSourceNew ();
6812 if (ssp != NULL)
6813 {
6814 ssp->subtype = subtype;
6815 ssp->name = subname;
6816 subname = NULL;
6817 ssp->next = biop->subtype;
6818 biop->subtype = ssp;
6819 }
6820 }
6821 subname = MemFree (subname);
6822 return biop;
6823 }
6824
6825 extern BioSourcePtr
6826 ExtractFromTitleToBioSourceOrgMod
6827 (CharPtr title,
6828 BioSourcePtr biop,
6829 CharPtr mod_name,
6830 Int4 subtype)
6831 {
6832 CharPtr valstr;
6833 CharPtr next_org_loc;
6834
6835 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6836 while ((valstr = FindValueFromPairInDeflineBeforeCharPtr (mod_name, title, next_org_loc)) != NULL)
6837 {
6838 biop = AddOrgModValue (biop, subtype, valstr);
6839 RemoveValueFromDefline (mod_name, title);
6840 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6841 }
6842 return biop;
6843 }
6844
6845 extern BioSourcePtr
6846 ExtractFromTitleToBioSourceSubSource
6847 (CharPtr title,
6848 BioSourcePtr biop,
6849 CharPtr mod_name,
6850 Int4 subtype)
6851 {
6852 CharPtr valstr;
6853 CharPtr next_org_loc;
6854
6855 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6856 while ((valstr = FindValueFromPairInDeflineBeforeCharPtr (mod_name, title, next_org_loc)) != NULL)
6857 {
6858 if (IsNonTextModifier (mod_name)) {
6859 if (StringICmp (valstr, "FALSE") == 0) {
6860 valstr = MemFree (valstr);
6861 } else if (StringICmp (valstr, "TRUE") == 0) {
6862 biop = AddSubSourceValue (biop, subtype, StringSave (""));
6863 valstr = MemFree (valstr);
6864 } else {
6865 biop = AddSubSourceValue (biop, subtype, valstr);
6866 }
6867 } else {
6868 biop = AddSubSourceValue (biop, subtype, valstr);
6869 }
6870 RemoveValueFromDefline (mod_name, title);
6871 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6872 }
6873 return biop;
6874 }
6875
6876 /* this function collects all of the common names prior to the next organism name
6877 * and assembles a semicolon-delimited list.
6878 */
6879 extern BioSourcePtr
6880 ExtractFromTitleToBioSourceCommonName
6881 (CharPtr title,
6882 BioSourcePtr biop)
6883 {
6884 CharPtr valstr, new_val;
6885 Int4 new_len;
6886 CharPtr next_org_loc;
6887
6888 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6889 while ((valstr = FindValueFromPairInDeflineBeforeCharPtr ("common name", title, next_org_loc)) != NULL)
6890 {
6891 if (!StringHasNoText (valstr))
6892 {
6893 biop = AddOrgRef (biop);
6894 if (StringHasNoText (biop->org->common))
6895 {
6896 biop->org->common = MemFree (biop->org->common);
6897 biop->org->common = valstr;
6898 valstr = NULL;
6899 }
6900 else
6901 {
6902 new_len = StringLen (biop->org->common) + StringLen (valstr) + 3;
6903 new_val = (CharPtr) MemNew (new_len * sizeof (Char));
6904 if (new_val != NULL)
6905 {
6906 sprintf (new_val, "%s; %s", biop->org->common, valstr);
6907 biop->org->common = MemFree (biop->org->common);
6908 biop->org->common = new_val;
6909 }
6910 }
6911 }
6912 valstr = MemFree (valstr);
6913 RemoveValueFromDefline ("common name", title);
6914 next_org_loc = FindValuePairInDefLine ("organism", title, NULL);
6915 }
6916 return biop;
6917 }
6918
6919 /* When the user specifies multiple organisms on the definition line, modifiers after the
6920 * second organism go with the second organism, after the third organism go with the third
6921 * organism, etc.
6922 */
6923 extern BioSourcePtr ExtractFromDeflineToBioSource (CharPtr defline, BioSourcePtr biop)
6924 {
6925 CharPtr taxname = NULL;
6926 OrgInfoPtr oip = NULL;
6927 CharPtr valstr;
6928 Nlm_EnumFieldAssocPtr ap;
6929 Nlm_QualNameAssocPtr qp;
6930 CharPtr next_org_loc;
6931
6932 if (StringHasNoText (defline))
6933 {
6934 return NULL;
6935 }
6936
6937 taxname = FindValueFromPairInDefline ("organism", defline);
6938 RemoveValueFromDefline ("organism", defline);
6939 if (StringHasNoText (taxname))
6940 {
6941 taxname = MemFree (taxname);
6942 return NULL;
6943 }
6944 else
6945 {
6946 biop = AddOrgRef (biop);
6947 if (biop == NULL)
6948 {
6949 return biop;
6950 }
6951 LoadOrganismList ();
6952 oip = FindByTaxName (taxname);
6953 SetTaxNameAndRemoveTaxRef (biop->org, taxname);
6954 }
6955
6956 /* add division */
6957 if (oip != NULL && !StringHasNoText (oip->div))
6958 {
6959 biop = AddOrgName (biop);
6960 if (biop == NULL)
6961 {
6962 return biop;
6963 }
6964 biop->org->orgname->div = StringSave (oip->div);
6965 }
6966
6967 /* add common name (s) - if there are multiple entries, separate with semicolon */
6968 biop = ExtractFromTitleToBioSourceCommonName (defline, biop);
6969 /* if common name was not supplied in defline, use common name from organism list */
6970 if (biop->org == NULL || StringHasNoText (biop->org->common))
6971 {
6972 if (oip != NULL && !StringHasNoText (oip->common))
6973 {
6974 biop = AddOrgRef (biop);
6975 if (biop == NULL)
6976 {
6977 return biop;
6978 }
6979 biop->org->common = StringSave (oip->common);
6980 }
6981 }
6982
6983 /* add lineage */
6984 if (oip != NULL && !StringHasNoText (oip->lineage))
6985 {
6986 biop = AddOrgName (biop);
6987 if (biop == NULL)
6988 {
6989 return biop;
6990 }
6991 biop->org->orgname->lineage = StringSave (oip->lineage);
6992 }
6993
6994 /* add origin */
6995 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
6996 valstr = FindValueFromPairInDeflineBeforeCharPtr ("origin", defline, next_org_loc);
6997 if (!StringHasNoText (valstr))
6998 {
6999 for (ap = biosource_origin_alist; ap->name != NULL; ap++) {
7000 if (StringICmp (valstr, ap->name) == 0) {
7001 if (biop == NULL)
7002 {
7003 biop = BioSourceNew ();
7004 }
7005 if (biop == NULL)
7006 {
7007 return biop;
7008 }
7009 biop->origin = (Uint1) ap->value;
7010 }
7011 }
7012 }
7013 if (valstr != NULL)
7014 {
7015 RemoveValueFromDefline ("origin", defline);
7016 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7017 }
7018 valstr = MemFree (valstr);
7019
7020 valstr = FindValueFromPairInDeflineBeforeCharPtr ("lineage", defline, next_org_loc);
7021 if (!StringHasNoText (valstr))
7022 {
7023 biop = AddOrgName (biop);
7024 }
7025 if (!StringHasNoText (valstr) && StringCmp (valstr, biop->org->orgname->lineage) != 0)
7026 {
7027 biop = AddOrgModValue (biop, ORGMOD_old_lineage, valstr);
7028 valstr = NULL;
7029 }
7030 if (valstr != NULL)
7031 {
7032 RemoveValueFromDefline ("lineage", defline);
7033 }
7034 valstr = MemFree (valstr);
7035
7036 biop = SetAllGeneticCodesFromTitle (biop, defline);
7037 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7038
7039 for (qp = current_orgmod_subtype_alist; qp->name != NULL; qp++) {
7040 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, qp->name, qp->value);
7041 }
7042 for (qp = current_subsource_subtype_alist; qp->name != NULL; qp++) {
7043 biop = ExtractFromTitleToBioSourceSubSource (defline, biop, qp->name, qp->value);
7044 }
7045
7046 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note-orgmod", 255);
7047 biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "note-subsrc", 255);
7048 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note", 255);
7049 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "comment", 255);
7050 biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "subsource", 255);
7051
7052
7053 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7054
7055 /* set location */
7056 valstr = FindValueFromPairInDeflineBeforeCharPtr ("location", defline, next_org_loc);
7057 if (StringHasNoText (valstr))
7058 {
7059 if (biop == NULL)
7060 {
7061 biop = BioSourceNew ();
7062 }
7063 if (biop == NULL)
7064 {
7065 return biop;
7066 }
7067 biop->genome = 1;
7068 }
7069 else if (StringICmp (valstr, "Mitochondrial") == 0)
7070 {
7071 if (biop == NULL)
7072 {
7073 biop = BioSourceNew ();
7074 }
7075 if (biop == NULL)
7076 {
7077 return biop;
7078 }
7079 biop->genome = 5;
7080 }
7081 else
7082 {
7083 for (ap = biosource_genome_simple_alist; ap->name != NULL; ap++) {
7084 if (StringICmp (valstr, ap->name) == 0) {
7085 if (biop == NULL)
7086 {
7087 biop = BioSourceNew ();
7088 }
7089 if (biop == NULL)
7090 {
7091 return biop;
7092 }
7093 biop->genome = (Uint1) ap->value;
7094 }
7095 }
7096 }
7097 if (valstr != NULL)
7098 {
7099 RemoveValueFromDefline ("location", defline);
7100 }
7101 valstr = MemFree (valstr);
7102
7103 TrimSpacesAroundString (defline);
7104
7105 return biop;
7106
7107 }
7108
7109 extern Boolean ProcessOneNucleotideTitle (Int2 seqPackage,
7110 SeqEntryPtr nsep, SeqEntryPtr top);
7111
7112
7113 static void ParseDeflineToBiop(CharPtr defline, BioSourcePtr biop)
7114 {
7115 CharPtr taxname = NULL;
7116 OrgInfoPtr oip = NULL;
7117 CharPtr valstr;
7118 EnumFieldAssocPtr ap;
7119 Nlm_QualNameAssocPtr qp;
7120 CharPtr next_org_loc;
7121
7122 if (StringHasNoText (defline) || biop == NULL)
7123 {
7124 return;
7125 }
7126
7127 taxname = FindValueFromPairInDefline ("organism", defline);
7128 RemoveValueFromDefline ("organism", defline);
7129 if (StringHasNoText (taxname))
7130 {
7131 taxname = MemFree (taxname);
7132 }
7133 else
7134 {
7135 biop = AddOrgRef (biop);
7136 biop->org->taxname = taxname;
7137 LoadOrganismList ();
7138 oip = FindByTaxName (taxname);
7139 }
7140
7141 /* add division */
7142 if (oip != NULL && !StringHasNoText (oip->div))
7143 {
7144 biop = AddOrgName (biop);
7145 biop->org->orgname->div = StringSave (oip->div);
7146 }
7147
7148 /* add common name (s) - if there are multiple entries, separate with semicolon */
7149 biop = ExtractFromTitleToBioSourceCommonName (defline, biop);
7150 /* if common name was not supplied in defline, use common name from organism list */
7151 if (biop->org == NULL || StringHasNoText (biop->org->common))
7152 {
7153 if (oip != NULL && !StringHasNoText (oip->common))
7154 {
7155 biop = AddOrgRef (biop);
7156 biop->org->common = StringSave (oip->common);
7157 }
7158 }
7159
7160 /* add lineage */
7161 if (oip != NULL && !StringHasNoText (oip->lineage))
7162 {
7163 biop = AddOrgName (biop);
7164 biop->org->orgname->lineage = StringSave (oip->lineage);
7165 }
7166
7167 /* add origin */
7168 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7169 valstr = FindValueFromPairInDeflineBeforeCharPtr ("origin", defline, next_org_loc);
7170 if (!StringHasNoText (valstr))
7171 {
7172 for (ap = biosource_origin_alist; ap->name != NULL; ap++) {
7173 if (StringICmp (valstr, ap->name) == 0) {
7174 biop->origin = (Uint1) ap->value;
7175 }
7176 }
7177 }
7178 if (valstr != NULL)
7179 {
7180 RemoveValueFromDefline ("origin", defline);
7181 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7182 }
7183 valstr = MemFree (valstr);
7184
7185 valstr = FindValueFromPairInDeflineBeforeCharPtr ("lineage", defline, next_org_loc);
7186 if (!StringHasNoText (valstr))
7187 {
7188 biop = AddOrgName (biop);
7189 }
7190 if (!StringHasNoText (valstr) && StringCmp (valstr, biop->org->orgname->lineage) != 0)
7191 {
7192 biop = AddOrgModValue (biop, ORGMOD_old_lineage, valstr);
7193 valstr = NULL;
7194 }
7195 if (valstr != NULL)
7196 {
7197 RemoveValueFromDefline ("lineage", defline);
7198 }
7199 valstr = MemFree (valstr);
7200
7201 biop = SetAllGeneticCodesFromTitle (biop, defline);
7202 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7203
7204 for (qp = current_orgmod_subtype_alist; qp->name != NULL; qp++) {
7205 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, qp->name, qp->value);
7206 }
7207 for (qp = current_subsource_subtype_alist; qp->name != NULL; qp++) {
7208 biop = ExtractFromTitleToBioSourceSubSource (defline, biop, qp->name, qp->value);
7209 }
7210
7211 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note-orgmod", 255);
7212 biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "note-subsrc", 255);
7213 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "note", 255);
7214 biop = ExtractFromTitleToBioSourceOrgMod (defline, biop, "comment", 255);
7215 biop = ExtractFromTitleToBioSourceSubSource (defline, biop, "subsource", 255);
7216
7217
7218 next_org_loc = FindValuePairInDefLine ("organism", defline, NULL);
7219
7220 /* set location */
7221 valstr = FindValueFromPairInDeflineBeforeCharPtr ("location", defline, next_org_loc);
7222 if (StringHasNoText (valstr))
7223 {
7224 /* don't set defaults */
7225 }
7226 else if (StringICmp (valstr, "Mitochondrial") == 0)
7227 {
7228 biop->genome = 5;
7229 }
7230 else
7231 {
7232 for (ap = biosource_genome_simple_alist; ap->name != NULL; ap++) {
7233 if (StringICmp (valstr, ap->name) == 0) {
7234 biop->genome = (Uint1) ap->value;
7235 }
7236 }
7237 }
7238 if (valstr != NULL)
7239 {
7240 RemoveValueFromDefline ("location", defline);
7241 }
7242 valstr = MemFree (valstr);
7243
7244 TrimSpacesAroundString (defline);
7245 }
7246
7247
7248 static void ParseModifiersFromDeflineCallback (BioseqPtr bsp, Pointer userdata)
7249 {
7250 CharPtr title;
7251 SeqDescrPtr sdp, sdp_biop, prev_sdp = NULL;
7252 BioSourcePtr biop = NULL;
7253 CharPtr valstr;
7254 SeqMgrDescContext context;
7255
7256 if (bsp == NULL) return;
7257
7258 if (ISA_aa(bsp->mol)) {
7259 return;
7260 }
7261
7262 sdp = bsp->descr;
7263 while (sdp != NULL && sdp->choice != Seq_descr_title) {
7264 prev_sdp = sdp;
7265 sdp = sdp->next;
7266 }
7267 if (sdp == NULL || sdp->data.ptrvalue == NULL) {
7268 return;
7269 }
7270
7271 title = sdp->data.ptrvalue;
7272
7273 if (StringChr(title, '[') == NULL || StringChr(title, ']') == NULL) {
7274 return;
7275 }
7276
7277 /* parse moltype values */
7278 SetMoleculeAndMolTypeFromTitle (bsp, title, SEQ_PKG_GENBANK);
7279
7280 /* get topology from defline */
7281 valstr = FindValueFromPairInDefline ("topology", title);
7282 if (valstr != NULL)
7283 {
7284 if (!StringHasNoText (valstr))
7285 {
7286 bsp->topology = TopologyFromString (valstr);
7287 }
7288 RemoveValueFromDefline ("topology", title);
7289 valstr = MemFree (valstr);
7290 }
7291
7292 /* add bankit comment for genetic code */
7293 valstr = FindValueFromPairInDefline ("gencode_comment", title);
7294 if (valstr != NULL)
7295 {
7296 AddGeneticCodeComment (bsp, valstr);
7297 RemoveValueFromDefline ("gencode_comment", title);
7298 valstr = MemFree (valstr);
7299 }
7300
7301 sdp_biop = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
7302
7303 if(sdp_biop == NULL) {
7304 if (bsp->idx.parenttype == OBJ_BIOSEQSET
7305 && bsp->idx.parentptr != NULL
7306 && ((BioseqSetPtr)bsp->idx.parentptr)->_class == BioseqseqSet_class_parts) {
7307 /* don't put sources on parts */
7308 } else {
7309 biop = BioSourceNew();
7310 sdp_biop = SeqDescrNew(bsp->descr);
7311 sdp_biop->choice = Seq_descr_source;
7312 sdp_biop->data.ptrvalue = biop;
7313 }
7314 } else {
7315 biop = sdp_biop->data.ptrvalue;
7316 }
7317
7318 ParseDeflineToBiop (title, biop);
7319
7320 if (StringHasNoText (title)) {
7321 /* remove empty defline */
7322 sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
7323 if (prev_sdp == NULL) {
7324 bsp->descr = sdp->next;
7325 } else {
7326 prev_sdp->next = sdp->next;
7327 }
7328 sdp->next = NULL;
7329 sdp = SeqDescrFree (sdp);
7330 }
7331 }
7332
7333
7334 extern void ParseModifiersFromDefline (IteM i)
7335 {
7336 BaseFormPtr bfp;
7337 SeqEntryPtr sep;
7338
7339 #ifdef WIN_MAC
7340 bfp = currentFormDataPtr;
7341 #else
7342 bfp = GetObjectExtra (i);
7343 #endif
7344 if (bfp == NULL) return;
7345 sep = GetTopSeqEntryForEntityID (bfp->input_entityID);
7346 if (sep == NULL) return;
7347
7348 VisitBioseqsInSep (sep, NULL, ParseModifiersFromDeflineCallback);
7349 Update ();
7350 ObjMgrSetDirtyFlag (bfp->input_entityID, TRUE);
7351 ObjMgrSendMsg (OM_MSG_UPDATE, bfp->input_entityID, 0, 0);
7352 }
7353
7354
7355 extern Boolean ProcessOneNucleotideTitle (Int2 seqPackage,
7356 SeqEntryPtr nsep, SeqEntryPtr top)
7357
7358 {
7359 BioSourcePtr biop = NULL;
7360 BioseqSetPtr bssp;
7361 BioseqPtr nbsp;
7362 Boolean needbiop;
7363 SeqEntryPtr sep;
7364 CharPtr str;
7365 CharPtr valstr;
7366 CharPtr title;
7367 ValNodePtr vnp;
7368 Int4 topology;
7369 #if 0
7370 SeqFeatPtr sfp;
7371 #endif
7372
7373 if (nsep == NULL || top == NULL) return FALSE;
7374 nbsp = (BioseqPtr) nsep->data.ptrvalue;
7375 if (nbsp == NULL) return FALSE;
7376 if (! ISA_na (nbsp->mol)) return FALSE;
7377 str = MemNew (PROC_NUC_STR_SIZE * sizeof (Char));
7378 if (str == NULL) return FALSE;
7379 sep = NULL;
7380
7381 SeqEntryExplore (top, (Pointer) &sep, FindFirstSeqEntryTitle);
7382 sep = FindNucSeqEntry (sep);
7383 if (sep != NULL) {
7384 vnp = SeqEntryGetSeqDescr (sep, Seq_descr_title, NULL);
7385 if (vnp != NULL && vnp->data.ptrvalue != NULL) {
7386 title = (CharPtr) vnp->data.ptrvalue;
7387
7388 SetMoleculeAndMolTypeFromTitle (nbsp, title, seqPackage);
7389
7390 if (nbsp->topology == 0)
7391 {
7392 topology = TOPOLOGY_LINEAR;
7393 }
7394 else
7395 {
7396 topology = nbsp->topology;
7397 }
7398
7399 /* get topology from defline */
7400 valstr = FindValueFromPairInDefline ("topology", title);
7401 if (valstr != NULL)
7402 {
7403 if (!StringHasNoText (valstr))
7404 {
7405 topology = TopologyFromString (valstr);
7406 }
7407 RemoveValueFromDefline ("topology", title);
7408 valstr = MemFree (valstr);
7409 }
7410 nbsp->topology = topology;
7411
7412 /* add bankit comment for genetic code */
7413 valstr = FindValueFromPairInDefline ("gencode_comment", title);
7414 if (valstr != NULL)
7415 {
7416 AddGeneticCodeComment (nbsp, valstr);
7417 RemoveValueFromDefline ("gencode_comment", title);
7418 valstr = MemFree (valstr);
7419 }
7420
7421 needbiop = FALSE;
7422
7423 if (PackageTypeIsSet (seqPackage)
7424 || seqPackage == SEQ_PKG_GENBANK)
7425 {
7426 needbiop = TRUE;
7427 if (GetAppParam ("SEQUIN", "PREFERENCES", "BIOSRCONALL", NULL, str, PROC_NUC_STR_SIZE)) {
7428 if (StringICmp (str, "FALSE") == 0) {
7429 needbiop = FALSE;
7430 }
7431 }
7432 }
7433
7434 vnp = SeqEntryGetSeqDescr (sep, Seq_descr_source, NULL);
7435 if (vnp == NULL)
7436 {
7437 biop = ExtractFromDeflineToBioSource (title, NULL);
7438 if (biop == NULL && needbiop)
7439 {
7440 biop = BioSourceNew ();
7441 }
7442
7443 if (biop != NULL)
7444 {
7445 vnp = CreateNewDescriptor (top, Seq_descr_source);
7446 if (vnp != NULL) {
7447 vnp->data.ptrvalue = (Pointer) biop;
7448 }
7449 }
7450 #if 0
7451 biop = BioSourceFromDefline (title);
7452 while (biop != NULL)
7453 {
7454 sfp = CreateNewFeature (sep, NULL, SEQFEAT_BIOSRC, NULL);
7455 if (sfp != NULL)
7456 {
7457 sfp->data.value.ptrvalue = biop;
7458 }
7459 biop = BioSourceFromDefline (title);
7460 }
7461 #endif
7462 }
7463
7464 if (StringHasNoText (title) || sep != top) {
7465 vnp = NULL;
7466 if (IS_Bioseq (sep)) {
7467 nbsp = (BioseqPtr) sep->data.ptrvalue;
7468 vnp = ValNodeExtract (&(nbsp->descr), Seq_descr_title);
7469 } else if (IS_Bioseq_set (sep)) {
7470 bssp = (BioseqSetPtr) sep->data.ptrvalue;
7471 vnp = ValNodeExtract (&(bssp->descr), Seq_descr_title);
7472 }
7473 if (vnp != NULL && StringHasNoText ((CharPtr) vnp->data.ptrvalue)) {
7474 vnp = ValNodeFreeData (vnp);
7475 }
7476 if (sep != top && vnp != NULL) {
7477 if (IS_Bioseq (top)) {
7478 nbsp = (BioseqPtr) top->data.ptrvalue;
7479 ValNodeLink (&(nbsp->descr), vnp);
7480 } else if (IS_Bioseq_set (top)) {
7481 bssp = (BioseqSetPtr) top->data.ptrvalue;
7482 ValNodeLink (&(bssp->descr), vnp);
7483 }
7484 }
7485 }
7486 }
7487 } else {
7488 needbiop = FALSE;
7489 if (PackageTypeIsSet (seqPackage)
7490 || seqPackage == SEQ_PKG_GENOMICCDNA)
7491 {
7492 needbiop = TRUE;
7493 if (GetAppParam ("SEQUIN", "PREFERENCES", "BIOSRCONALL", NULL, str, PROC_NUC_STR_SIZE)) {
7494 if (StringICmp (str, "FALSE") == 0) {
7495 needbiop = FALSE;
7496 }
7497 }
7498 }
7499 }
7500 MemFree (str);
7501
7502 return TRUE;
7503 }
7504
7505 static Boolean AutomaticNucleotideProcess (SequencesFormPtr sqfp, SeqEntryPtr nsep,
7506 SeqEntryPtr top)
7507
7508 {
7509 BioseqSetPtr bssp;
7510 Boolean rsult;
7511 SeqEntryPtr tmp;
7512
7513 if (sqfp == NULL || nsep == NULL || top == NULL) return FALSE;
7514 if (IS_Bioseq_set (nsep)) {
7515 bssp = (BioseqSetPtr) nsep->data.ptrvalue;
7516 rsult = FALSE;
7517 if (bssp != NULL) {
7518 for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
7519 if (AutomaticNucleotideProcess (sqfp, tmp, top)) {
7520 rsult = TRUE;
7521 }
7522 }
7523 }
7524 return rsult;
7525 }
7526 return ProcessOneNucleotideTitle (sqfp->seqPackage,
7527 nsep, top);
7528 }
7529
7530 typedef struct idlist {
7531 BioseqPtr bsp;
7532 CharPtr key;
7533 struct idlist PNTR left;
7534 struct idlist PNTR right;
7535 } IdList, PNTR IdListPtr;
7536
7537 static void BuildTree (IdListPtr PNTR head, BioseqPtr bsp, CharPtr x)
7538
7539 {
7540 Int2 comp;
7541 IdListPtr idlist;
7542 SeqIdPtr sip;
7543 CharPtr str;
7544
7545 if (*head != NULL) {
7546 idlist = *head;
7547 comp = StringICmp (idlist->key, x);
7548 if (comp < 0) {
7549 BuildTree (&(idlist->right), bsp, x);
7550 } else if (comp > 0) {
7551 BuildTree (&(idlist->left), bsp, x);
7552 } else {
7553 sip = MakeNewProteinSeqId (NULL, NULL);
7554 if (sip != NULL) {
7555 bsp->id = SeqIdFree (bsp->id);
7556 bsp->id = sip;
7557 SeqMgrReplaceInBioseqIndex (bsp);
7558 str = SeqIdWholeLabel (SeqIdFindWorst (bsp->id), PRINTID_REPORT);
7559 BuildTree (head, bsp, str);
7560 str = MemFree (str);
7561 }
7562 }
7563 } else {
7564 idlist = MemNew (sizeof (IdList));
7565 if (idlist != NULL) {
7566 *head = idlist;
7567 idlist->bsp = bsp;
7568 idlist->key = SeqIdWholeLabel (SeqIdFindWorst (bsp->id), PRINTID_REPORT);
7569 idlist->left = NULL;
7570 idlist->right = NULL;
7571 }
7572 }
7573 }
7574
7575 static void FreeTree (IdListPtr PNTR head)
7576
7577 {
7578 IdListPtr idlist;
7579
7580 if (head != NULL && *head != NULL) {
7581 idlist = *head;
7582 FreeTree (&(idlist->left));
7583 FreeTree (&(idlist->right));
7584 MemFree (idlist->key);
7585 MemFree (idlist);
7586 }
7587 }
7588
7589 static void ResolveCollidingIDs (IdListPtr PNTR head, SeqEntryPtr list)
7590
7591 {
7592 BioseqPtr bsp;
7593 CharPtr str;
7594
7595 if (head == NULL) return;
7596 while (list != NULL) {
7597 if (IS_Bioseq (list)) {
7598 bsp = (BioseqPtr) list->data.ptrvalue;
7599 if (bsp != NULL) {
7600 str = SeqIdWholeLabel (SeqIdFindWorst (bsp->id), PRINTID_REPORT);
7601 BuildTree (head, bsp, str);
7602 str = MemFree (str);
7603 }
7604 }
7605 list = list->next;
7606 }
7607 }
7608
7609
7610 static void PutMolInfoOnSeqEntry (SequencesFormPtr sqfp, SeqEntryPtr sep)
7611
7612 {
7613 BioseqSetPtr bssp;
7614 MolInfoPtr mip;
7615 ValNodePtr vnp;
7616
7617 if (sqfp != NULL && sep != NULL) {
7618 if (IS_Bioseq_set (sep))
7619 {
7620 bssp = (BioseqSetPtr) sep->data.ptrvalue;
7621 for (sep = bssp->seq_set; sep != NULL; sep = sep->next)
7622 {
7623 PutMolInfoOnSeqEntry (sqfp, sep);
7624 }
7625 return;
7626 }
7627
7628 vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
7629 if (vnp == NULL)
7630 {
7631 vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
7632 }
7633 if (vnp != NULL)
7634 {
7635 mip = (MolInfoPtr) vnp->data.ptrvalue;
7636 if (mip == NULL)
7637 {
7638 mip = MolInfoNew ();
7639 vnp->data.ptrvalue = mip;
7640 }
7641 }
7642 }
7643 }
7644
7645 static void PrefixOrgToDefline (SeqEntryPtr sep)
7646
7647 {
7648 BioSourcePtr biop;
7649 BioseqPtr bsp;
7650 BioseqSetPtr bssp;
7651 CharPtr def;
7652 OrgRefPtr orp;
7653 CharPtr ptr;
7654 CharPtr str;
7655 Char taxname [64];
7656 ValNodePtr ttl;
7657 ValNodePtr vnp;
7658
7659 if (sep == NULL) return;
7660 if (IS_Bioseq_set (sep)) {
7661 bssp = (BioseqSetPtr) sep->data.ptrvalue;
7662 if (bssp != NULL && (bssp->_class == 7 ||
7663 (IsPopPhyEtcSet (bssp->_class)))) {
7664 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
7665 PrefixOrgToDefline (sep);
7666 }
7667 return;
7668 }
7669 }
7670
7671 if (! IS_Bioseq (sep)) return;
7672 bsp = (BioseqPtr) sep->data.ptrvalue;
7673 if (bsp == NULL) return;
7674
7675 taxname [0] = '\0';
7676 orp = NULL;
7677 biop = NULL;
7678 ttl = NULL;
7679 vnp = bsp->descr;
7680 for (vnp = bsp->descr; vnp != NULL; vnp = vnp->next) {
7681 if (vnp->choice == Seq_descr_source) {
7682 biop = (BioSourcePtr) vnp->data.ptrvalue;
7683 } else if (vnp->choice == Seq_descr_org) {
7684 orp = (OrgRefPtr) vnp->data.ptrvalue;
7685 } else if (vnp->choice == Seq_descr_title) {
7686 ttl = vnp;
7687 }
7688 }
7689 if (orp == NULL && biop != NULL) {
7690 orp = biop->org;
7691 }
7692 if (orp == NULL) return;
7693 if (ttl == NULL) return;
7694 StringNCpy_0 (taxname, orp->taxname, sizeof (taxname));
7695 ptr = StringSearch (taxname, "(");
7696 if (ptr != NULL) {
7697 *ptr = '\0';
7698 }
7699 TrimSpacesAroundString (taxname);
7700 if ((StringICmp (taxname, "Human immunodeficiency virus type 1") == 0) ||
7701 (StringICmp (taxname, "Human immunodeficiency virus 1") == 0)) {
7702 StringCpy (taxname, "HIV-1");
7703 } else if ((StringICmp (taxname,"Human immunodeficiency virus type 2")==0) ||
7704 (StringICmp (taxname,"Human immunodeficiency virus 2")==0)) {
7705 StringCpy (taxname, "HIV-2");
7706 }
7707
7708 def = (CharPtr) ttl->data.ptrvalue;
7709 if (StringHasNoText (def)) return;
7710
7711 ptr = StringISearch (def, taxname);
7712 if (ptr != NULL && ptr == def) return;
7713 str = MemNew ((StringLen (taxname) + StringLen (def) + 4) * sizeof (Char));
7714 if (str == NULL) return;
7715 StringCpy (str, taxname);
7716 StringCat (str, " ");
7717 StringCat (str, def);
7718 ttl->data.ptrvalue = MemFree (ttl->data.ptrvalue);
7719 ttl->data.ptrvalue = str;
7720 }
7721
7722 static CharPtr onecomponent = "\
7723 Multiple sequence components are expected in this submission.\n\
7724 They should all be read in at the same time from the same file.";
7725
7726 static void OnlyOneComponentWarning (SequencesFormPtr sqfp)
7727
7728 {
7729 CharPtr type;
7730
7731 if (sqfp != NULL) {
7732 if (sqfp->seqPackage == SEQ_PKG_GENOMICCDNA
7733 || PackageTypeIsSingle (sqfp->seqPackage))
7734 {
7735 return;
7736 }
7737 switch (sqfp->seqPackage) {
7738 case SEQ_PKG_SEGMENTED :
7739 type = "segmented sequence";
7740 break;
7741 case SEQ_PKG_POPULATION :
7742 type = "population set";
7743 break;
7744 case SEQ_PKG_PHYLOGENETIC :
7745 type = "phylogenetic set";
7746 break;
7747 case SEQ_PKG_MUTATION :
7748 type = "mutation set";
7749 break;
7750 case SEQ_PKG_ENVIRONMENT :
7751 type = "environmental samples";
7752 break;
7753 case SEQ_PKG_GENBANK :
7754 type = "batch submission";
7755 break;
7756 default :
7757 type = "unknown set";
7758 break;
7759 }
7760 Message (MSG_OK, "WARNING - There is only one component in this %s.\n%s",
7761 type, onecomponent);
7762 }
7763 }
7764
7765 /*---------------------------------*/
7766 /* Parse the gene and gene-related */
7767 /* fields from the title. */
7768 /*---------------------------------*/
7769 extern void
7770 AddGeneFeatureFromTitle
7771 (SeqEntryPtr nucsep,
7772 CharPtr ttl,
7773 SeqLocPtr slp)
7774 {
7775 CharPtr gene = NULL;
7776 CharPtr gene_desc = NULL;
7777 CharPtr allele = NULL;
7778 CharPtr gene_syn = NULL;
7779 GeneRefPtr grp = NULL;
7780 SeqFeatPtr sfp;
7781 SeqIdPtr sip;
7782 BioseqPtr nbsp, bsp;
7783 SeqLocPtr gslp;
7784 Boolean hasNulls;
7785
7786 if (nucsep == NULL || !IS_Bioseq (nucsep)
7787 || (nbsp = (BioseqPtr) nucsep->data.ptrvalue) == NULL
7788 || StringHasNoText (ttl) || slp == NULL)
7789 {
7790 return;
7791 }
7792
7793 gene = FindValueFromPairInDefline ("gene", ttl);
7794 if (!StringHasNoText (gene))
7795 {
7796 gene_desc = StringChr (gene, ';');
7797 if (gene_desc != NULL) {
7798 *gene_desc = '\0';
7799 gene_desc++;
7800 allele = StringChr (gene_desc, ';');
7801 if (allele != NULL) {
7802 *allele = '\0';
7803 allele++;
7804 }
7805 }
7806 grp = CreateNewGeneRef (gene, allele, gene_desc, FALSE);
7807 }
7808 gene = MemFree (gene);
7809
7810 /*-----------------------------------------*/
7811 /* Parse the gene_syn field from the title */
7812 /*-----------------------------------------*/
7813
7814 gene_syn = FindValueFromPairInDefline ("gene_syn", ttl);
7815 if (!StringHasNoText (gene_syn))
7816 {
7817 if (grp == NULL) {
7818 grp = GeneRefNew ();
7819 }
7820 ValNodeCopyStr(&(grp->syn),0,gene_syn);
7821 }
7822 gene_syn = MemFree (gene_syn);
7823
7824 /* Create the gene feature */
7825 if (grp != NULL) {
7826 if (ExtendGene (grp, nucsep, slp)) {
7827 grp = GeneRefFree (grp);
7828 } else {
7829 sfp = CreateNewFeature (nucsep, NULL, SEQFEAT_GENE, NULL);
7830 if (sfp != NULL) {
7831 sfp->data.value.ptrvalue = (Pointer) grp;
7832 sfp->location = SeqLocFree (sfp->location);
7833 sfp->location = AsnIoMemCopy ((Pointer) slp,
7834 (AsnReadFunc) SeqLocAsnRead,
7835 (AsnWriteFunc) SeqLocAsnWrite);
7836 sip = SeqLocId (sfp->location);
7837 if (sip != NULL) {
7838 bsp = BioseqFind (sip);
7839 } else {
7840 bsp = nbsp;
7841 }
7842 if (bsp != NULL) {
7843 gslp = SeqLocMerge (bsp, sfp->location, NULL, TRUE, FALSE, FALSE);
7844 if (gslp != NULL) {
7845 sfp->location = SeqLocFree (sfp->location);
7846 sfp->location = gslp;
7847 if (bsp->repr == Seq_repr_seg) {
7848 gslp = SegLocToPartsEx (bsp, sfp->location, TRUE);
7849 sfp->location = SeqLocFree (sfp->location);
7850 sfp->location = gslp;
7851 hasNulls = LocationHasNullsBetween (sfp->location);
7852 sfp->partial = (sfp->partial || hasNulls);
7853 }
7854 FreeAllFuzz (gslp);
7855 }
7856 }
7857 }
7858 }
7859 RemoveValueFromDefline ("gene", ttl);
7860 RemoveValueFromDefline ("gene_syn", ttl);
7861 }
7862 }
7863
7864 extern SeqFeatPtr AddProteinFeatureFromDefline (SeqEntryPtr psep, CharPtr title)
7865 {
7866 CharPtr activity = NULL;
7867 CharPtr ec = NULL;
7868 CharPtr prot_name = NULL;
7869 CharPtr prot_desc = NULL;
7870 CharPtr other_prot_desc = NULL, tmp_desc;
7871 ProtRefPtr prp;
7872 SeqFeatPtr sfp = NULL;
7873
7874 if (psep == NULL)
7875 {
7876 return NULL;
7877 }
7878
7879 /*-----------------------------------------*/
7880 /* Parse the function field from the title */
7881 /*-----------------------------------------*/
7882
7883 activity = FindValueFromPairInDefline ("function", title);
7884
7885 /*------------------------------------------*/
7886 /* Parse the EC_number field from the title */
7887 /*------------------------------------------*/
7888
7889 ec = FindValueFromPairInDefline ("EC_number", title);
7890
7891 /*---------------------------------*/
7892 /* Parse the protein and prot_desc */
7893 /* fields from the title. */
7894 /*---------------------------------*/
7895
7896 prot_name = FindValueFromPairInDefline ("protein", title);
7897
7898 /*---------------------------------*/
7899 /* If we found a protein value ... */
7900 /*---------------------------------*/
7901 if (!StringHasNoText (prot_name))
7902 {
7903 /*----------------------------------------------*/
7904 /* ... search for a protein description, either */
7905 /* in the prot field (seperated by a ';') */
7906 /* or in its own 'prot_desc' field. */
7907 /*----------------------------------------------*/
7908
7909 prot_desc = StringChr (prot_name, ';');
7910 if (prot_desc != NULL)
7911 {
7912 *prot_desc = '\0';
7913 prot_desc++;
7914 /* ignore this description if empty */
7915 if (StringHasNoText (prot_desc))
7916 {
7917 prot_desc = NULL;
7918 }
7919 else
7920 {
7921 prot_desc = StringSave (prot_desc);
7922 }
7923 }
7924 }
7925 other_prot_desc = FindValueFromPairInDefline ("prot_desc", title);
7926 if (StringHasNoText (other_prot_desc))
7927 {
7928 other_prot_desc = MemFree (other_prot_desc);
7929 }
7930 else
7931 {
7932 if (prot_desc == NULL)
7933 {
7934 prot_desc = other_prot_desc;
7935 other_prot_desc = NULL;
7936 }
7937 else
7938 {
7939 tmp_desc = (CharPtr) MemNew ((StringLen (prot_desc) + StringLen (other_prot_desc) + 3)
7940 * sizeof (Char));
7941 if (tmp_desc != NULL)
7942 {
7943 StringCpy (tmp_desc, prot_desc);
7944 StringCat (tmp_desc, ";");
7945 StringCat (tmp_desc, other_prot_desc);
7946 prot_desc = MemFree (prot_desc);
7947 other_prot_desc = MemFree (other_prot_desc);
7948 prot_desc = tmp_desc;
7949 }
7950 }
7951 }
7952
7953 /*--------------------------------*/
7954 /* ... add the prot and prot_desc */
7955 /* to the Seq Features. */
7956 /*--------------------------------*/
7957
7958 prp = CreateNewProtRef (prot_name, prot_desc, ec, activity);
7959 if (prp != NULL)
7960 {
7961 sfp = CreateNewFeature (psep, NULL, SEQFEAT_PROT, NULL);
7962 if (sfp != NULL)
7963 {
7964 sfp->data.value.ptrvalue = (Pointer) prp;
7965 RemoveValueFromDefline ("protein", title);
7966 RemoveValueFromDefline ("prot_desc", title);
7967 RemoveValueFromDefline ("function", title);
7968 RemoveValueFromDefline ("EC_number", title);
7969 }
7970 }
7971 return sfp;
7972 }
7973
7974 extern void
7975 AddCodingRegionFieldsFromProteinTitle
7976 (CdRegionPtr crp,
7977 CharPtr title,
7978 CharPtr PNTR pcomment)
7979 {
7980 CharPtr comment, comment_loc, total_comment = NULL, tmp_comment;
7981
7982 if (crp == NULL || StringHasNoText (title))
7983 {
7984 return;
7985 }
7986
7987 /*---------------------*/
7988 /* Parse the ORF field */
7989 /*---------------------*/
7990 if (FindValuePairInDefLine ("orf", title, NULL) != NULL)
7991 {
7992 crp->orf = TRUE;
7993 RemoveValueFromDefline ("orf", title);
7994 }
7995
7996 if (pcomment == NULL)
7997 {
7998 return;
7999 }
8000
8001 /*-------------------------------*/
8002 /* Parse the comment/note fields */
8003 /*-------------------------------*/
8004 comment_loc = FindValuePairInDefLine ("comment", title, NULL);
8005 while (comment_loc != NULL)
8006 {
8007 comment = FindValueFromPairInDefline ("comment", comment_loc);
8008 if (!StringHasNoText (comment))
8009 {
8010 if (total_comment == NULL)
8011 {
8012 total_comment = comment;
8013 comment = NULL;
8014 }
8015 else
8016 {
8017 tmp_comment = (CharPtr) MemNew ((StringLen (total_comment) + StringLen (comment) + 3) * sizeof (Char));
8018 if (tmp_comment != NULL)
8019 {
8020 StringCpy (tmp_comment, total_comment);
8021 StringCat (tmp_comment, ";");
8022 StringCat (tmp_comment, comment);
8023 total_comment = MemFree (total_comment);
8024 total_comment = tmp_comment;
8025 }
8026 }
8027 }
8028 comment = MemFree (comment);
8029 RemoveValueFromDefline ("comment", title);
8030 comment_loc = FindValuePairInDefLine ("comment", title, NULL);
8031 }
8032
8033 *pcomment = total_comment;
8034 }
8035
8036 static void AutomaticMrnaProcess (SeqEntryPtr nucsep, SeqEntryPtr mrnasep, Boolean partial5, Boolean partial3)
8037
8038 {
8039 CharPtr mrna = NULL;
8040 CharPtr comment = NULL;
8041 BioseqPtr bsp;
8042 MolInfoPtr mip;
8043 BioseqPtr mrnabsp;
8044 BioseqPtr nucbsp;
8045 SeqLocPtr oldslp;
8046 RnaRefPtr rrp;
8047 SeqFeatPtr sfp;
8048 SeqIdPtr sip;
8049 SeqLocPtr slp;
8050 CharPtr ttl;
8051 ValNodePtr vnp;
8052
8053 if (nucsep == NULL || mrnasep == NULL) return;
8054 if (IS_Bioseq (nucsep) && IS_Bioseq (mrnasep)) {
8055 nucbsp = (BioseqPtr) nucsep->data.ptrvalue;
8056 mrnabsp = (BioseqPtr) mrnasep->data.ptrvalue;
8057 if (nucbsp == NULL || mrnabsp == NULL) return;
8058 slp = AlignmRNA2genomic (nucbsp, mrnabsp);
8059 if (slp == NULL) return;
8060 sip = SeqLocId (slp);
8061 if (sip != NULL) {
8062 bsp = BioseqFind (sip);
8063 if (bsp != NULL) {
8064 if (bsp->repr == Seq_repr_seg) {
8065 oldslp = slp;
8066 slp = SegLocToParts (bsp, oldslp);
8067 FreeAllFuzz (slp);
8068 SeqLocFree (oldslp);
8069 }
8070 }
8071 }
8072 StripLocusFromSeqLoc (slp);
8073 ttl = NULL;
8074 vnp = ValNodeFindNext (mrnabsp->descr, NULL, Seq_descr_title);
8075 if (vnp != NULL) {
8076 ttl = (CharPtr) vnp->data.ptrvalue;
8077 }
8078 if (ttl != NULL) {
8079 AddGeneFeatureFromTitle (nucsep, ttl, slp);
8080
8081 /* get mRNA name */
8082 mrna = FindValueFromPairInDefline ("mrna", ttl);
8083 RemoveValueFromDefline ("mrna", ttl);
8084 if (StringHasNoText (mrna))
8085 {
8086 mrna = MemFree (mrna);
8087 mrna = FindValueFromPairInDefline ("cdna", ttl);
8088 RemoveValueFromDefline ("cdna", ttl);
8089 }
8090 }
8091 rrp = RnaRefNew ();
8092 if (rrp != NULL) {
8093 rrp->type = 2;
8094 if (! StringHasNoText (mrna)) {
8095 rrp->ext.choice = 1;
8096 rrp->ext.value.ptrvalue = mrna;
8097 mrna = NULL;
8098 }
8099 sfp = CreateNewFeature (nucsep, NULL, SEQFEAT_RNA, NULL);
8100 if (sfp != NULL) {
8101 sfp->data.value.ptrvalue = (Pointer) rrp;
8102 sfp->location = SeqLocFree (sfp->location);
8103 sfp->location = AsnIoMemCopy ((Pointer) slp,
8104 (AsnReadFunc) SeqLocAsnRead,
8105 (AsnWriteFunc) SeqLocAsnWrite);
8106 SetSeqFeatProduct (sfp, mrnabsp);
8107 SetSeqLocPartial (sfp->location, partial5, partial3);
8108 sfp->partial = (sfp->partial || partial5 || partial3);
8109 if (ttl != NULL) {
8110 comment = FindValueFromPairInDefline ("comment", ttl);
8111 if (!StringHasNoText (comment)) {
8112 sfp->comment = comment;
8113 }
8114 else
8115 {
8116 comment = MemFree (comment);
8117 }
8118 RemoveValueFromDefline ("comment", ttl);
8119 }
8120 }
8121 }
8122 mrna = MemFree (mrna);
8123 SeqLocFree (slp);
8124 if (StringHasNoText (ttl)) {
8125 ValNodeExtract (&(mrnabsp->descr), Seq_descr_title);
8126 }
8127 mip = MolInfoNew ();
8128 if (mip != NULL) {
8129 mip->biomol = 3;
8130 if (partial5 && partial3) {
8131 mip->completeness = 5;
8132 } else if (partial5) {
8133 mip->completeness = 3;
8134 } else if (partial3) {
8135 mip->completeness = 4;
8136 }
8137 vnp = CreateNewDescriptor (mrnasep, Seq_descr_molinfo);
8138 if (vnp != NULL) {
8139 vnp->data.ptrvalue = (Pointer) mip;
8140 }
8141 }
8142 mrnabsp->mol = Seq_mol_rna;
8143 }
8144 }
8145
8146 static CharPtr LookForValueInBioseq (SeqEntryPtr sep, Uint1 mol, CharPtr valname)
8147 {
8148 BioseqPtr bsp;
8149 CharPtr title;
8150 ValNodePtr vnp;
8151
8152 if (sep == NULL || StringHasNoText (valname)) return FALSE;
8153 if (! IS_Bioseq (sep)) return FALSE;
8154 bsp = (BioseqPtr) sep->data.ptrvalue;
8155 if (bsp == NULL || bsp->mol != mol || bsp->descr == NULL) return FALSE;
8156 vnp = ValNodeFindNext (bsp->descr, NULL, Seq_descr_title);
8157 if (vnp == NULL || vnp->data.ptrvalue == NULL) return FALSE;
8158 title = (CharPtr) vnp->data.ptrvalue;
8159 return FindValueFromPairInDefline (valname, title);
8160 }
8161
8162 static void FindBioseqWithValue (SeqEntryPtr sep, Uint1 mol, CharPtr valname, CharPtr value, SeqEntryPtr PNTR rsult)
8163 {
8164 BioseqPtr bsp = NULL;
8165 BioseqSetPtr bssp = NULL;
8166 CharPtr match_value;
8167
8168 if (sep == NULL || sep->data.ptrvalue == NULL || rsult == NULL) return;
8169 if (IS_Bioseq (sep)) {
8170 bsp = (BioseqPtr) sep->data.ptrvalue;
8171 match_value = LookForValueInBioseq (sep, mol, valname);
8172 if (StringICmp (match_value, value))
8173 {
8174 *rsult = sep;
8175 }
8176 match_value = MemFree (match_value);
8177 } else if (IS_Bioseq_set (sep)) {
8178 bssp = (BioseqSetPtr) sep->data.ptrvalue;
8179 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
8180 FindBioseqWithValue (sep, mol, valname, value, rsult);
8181 }
8182 }
8183 }
8184
8185 static void RemoveValueFromBioseq (SeqEntryPtr sep, CharPtr valname)
8186 {
8187 BioseqPtr bsp;
8188 ValNodePtr vnp;
8189
8190 if (sep == NULL) return;
8191 if (! IS_Bioseq (sep)) return;
8192 bsp = (BioseqPtr) sep->data.ptrvalue;
8193 if (bsp == NULL || bsp->descr == NULL) return;
8194 vnp = SeqEntryGetSeqDescr (sep, Seq_descr_title, NULL);
8195 if (vnp == NULL) return;
8196 RemoveValueFromDefline (valname, vnp->data.ptrvalue);
8197 if (StringHasNoText (vnp->data.ptrvalue)) {
8198 ValNodeExtract (&(bsp->descr), Seq_descr_title);
8199 }
8200 }
8201
8202 static SeqEntryPtr FindRnaByRefOnRna (SeqEntryPtr sep, SeqEntryPtr psep)
8203
8204 {
8205 SeqEntryPtr msep;
8206 CharPtr prot_name;
8207
8208 msep = NULL;
8209 if (sep == NULL || psep == NULL) return NULL;
8210 prot_name = LookForValueInBioseq (psep, Seq_mol_aa, "prot");
8211 if (!StringHasNoText (prot_name))
8212 {
8213 FindBioseqWithValue (sep, Seq_mol_rna, "prot", prot_name, &msep);
8214 RemoveValueFromBioseq (msep, "prot");
8215 }
8216 prot_name = MemFree (prot_name);
8217 return msep;
8218 }
8219
8220 static void FindRnaByName (SeqEntryPtr sep, CharPtr str, SeqEntryPtr PNTR msep)
8221
8222 {
8223 BioseqPtr bsp = NULL;
8224 BioseqSetPtr bssp = NULL;
8225 RnaRefPtr rrp;
8226 SeqAnnotPtr sap;
8227 SeqFeatPtr sfp;
8228
8229 if (sep == NULL || sep->data.ptrvalue == NULL) return;
8230 if (str == NULL || msep == NULL) return;
8231 if (IS_Bioseq (sep)) {
8232 bsp = (BioseqPtr) sep->data.ptrvalue;
8233 sap = bsp->annot;
8234 } else if (IS_Bioseq_set (sep)) {
8235 bssp = (BioseqSetPtr) sep->data.ptrvalue;
8236 sap = bssp->annot;
8237 } else return;
8238 while (sap != NULL) {
8239 if (sap->type == 1) {
8240 sfp = (SeqFeatPtr) sap->data;
8241 while (sfp != NULL) {
8242 if (sfp->data.choice == SEQFEAT_RNA) {
8243 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
8244 if (rrp != NULL && rrp->type == 2 && rrp->ext.choice == 1 && sfp->product != NULL) {
8245 if (StringICmp (rrp->ext.value.ptrvalue, str) == 0) {
8246 bsp = BioseqFind (SeqLocId (sfp->product));
8247 if (bsp != NULL) {
8248 *msep = SeqMgrGetSeqEntryForData (bsp);
8249 }
8250 }
8251 }
8252 }
8253 sfp = sfp->next;
8254 }
8255 }
8256 sap = sap->next;
8257 }
8258 if (bssp != NULL) {
8259 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
8260 FindRnaByName (sep, str, msep);
8261 }
8262 }
8263 }
8264
8265 static SeqEntryPtr FindRnaByRefOnProtein (SeqEntryPtr sep, SeqEntryPtr psep)
8266
8267 {
8268 SeqEntryPtr msep;
8269 CharPtr mrna_name;
8270
8271 msep = NULL;
8272 if (sep == NULL || psep == NULL) return NULL;
8273 mrna_name = LookForValueInBioseq (psep, Seq_mol_aa, "mrna");
8274 if (!StringHasNoText (mrna_name))
8275 {
8276 FindRnaByName (sep, mrna_name, &msep);
8277 RemoveValueFromBioseq (msep, "mrna");
8278 }
8279 mrna_name = MemFree (mrna_name);
8280 return msep;
8281 }
8282
8283 static void FindRnaByLocationOverlap (SeqEntryPtr sep, SeqLocPtr slp,
8284 Int4Ptr mindiff, SeqEntryPtr PNTR msep)
8285
8286 {
8287 BioseqPtr bsp = NULL;
8288 BioseqSetPtr bssp = NULL;
8289 Int4 diff;
8290 RnaRefPtr rrp;
8291 SeqAnnotPtr sap;
8292 SeqFeatPtr sfp;
8293
8294 if (sep == NULL || sep->data.ptrvalue == NULL) return;
8295 if (slp == NULL || mindiff == NULL || msep == NULL) return;
8296 if (IS_Bioseq (sep)) {
8297 bsp = (BioseqPtr) sep->data.ptrvalue;
8298 sap = bsp->annot;
8299 } else if (IS_Bioseq_set (sep)) {
8300 bssp = (BioseqSetPtr) sep->data.ptrvalue;
8301 sap = bssp->annot;
8302 } else return;
8303 while (sap != NULL) {
8304 if (sap->type == 1) {
8305 sfp = (SeqFeatPtr) sap->data;
8306 while (sfp != NULL) {
8307 if (sfp->data.choice == SEQFEAT_RNA) {
8308 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
8309 if (rrp != NULL && rrp->type == 2 && sfp->product != NULL) {
8310 diff = SeqLocAinB (slp, sfp->location);
8311 if (diff >= 0) {
8312 if (diff < *mindiff) {
8313 bsp = BioseqFind (SeqLocId (sfp->product));
8314 if (bsp != NULL) {
8315 *mindiff = diff;
8316 *msep = SeqMgrGetSeqEntryForData (bsp);
8317 }
8318 }
8319 }
8320 }
8321 }
8322 sfp = sfp->next;
8323 }
8324 }
8325 sap = sap->next;
8326 }
8327 if (bssp != NULL) {
8328 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
8329 FindRnaByLocationOverlap (sep, slp, mindiff, msep);
8330 }
8331 }
8332 }
8333
8334 static void FuseNucProtBiosources (SeqEntryPtr sep)
8335
8336 {
8337 BioSourcePtr biop1, biop2;
8338 BioseqPtr bsp;
8339 BioseqSetPtr bssp;
8340 ValNodePtr PNTR prev;
8341 ValNodePtr sdp1, sdp2;
8342 SeqEntryPtr tmp;
8343
8344 if (sep == NULL) return;
8345 if (! IS_Bioseq_set (sep)) return;
8346 bssp = (BioseqSetPtr) sep->data.ptrvalue;
8347 if (bssp == NULL || bssp->_class != BioseqseqSet_class_nuc_prot) return;
8348 tmp = FindNucSeqEntry (sep);
8349 if (tmp == NULL) return;
8350 if (! IS_Bioseq (tmp)) return;
8351 bsp = (BioseqPtr) tmp->data.ptrvalue;
8352 if (bsp == NULL) return;
8353 prev = &(bssp->descr);
8354 sdp1 = bssp->descr;
8355 while (sdp1 != NULL && sdp1->choice != Seq_descr_source) {
8356 prev = &(sdp1->next);
8357 sdp1 = sdp1->next;
8358 }
8359 if (sdp1 == NULL) return;
8360 sdp2 = SeqEntryGetSeqDescr (tmp, Seq_descr_source, NULL);
8361 if (sdp2 == NULL) return;
8362 biop1 = (BioSourcePtr) sdp1->data.ptrvalue;
8363 biop2 = (BioSourcePtr) sdp2->data.ptrvalue;
8364 if (CmpOrgById (biop1, biop2)) {
8365 *prev = sdp1->next;
8366 sdp1->next = NULL;
8367 SeqDescrFree (sdp1);
8368 }
8369 }
8370
8371 static void AssignOneProtein
8372 (SeqEntryPtr prot_sep,
8373 SequencesFormPtr sqfp,
8374 SeqEntryPtr assign_sep,
8375 SeqLocPtr use_this,
8376 BioseqPtr nucbsp,
8377 Int2 code,
8378 Boolean makeMRNA)
8379 {
8380 MolInfoPtr mip;
8381 SeqEntryPtr msep = NULL;
8382 BioseqPtr protbsp;
8383 SeqLocPtr slp;
8384 Int4 mindiff;
8385 Boolean partialN;
8386 Boolean partialC;
8387 ValNodePtr vnp;
8388
8389 if (prot_sep == NULL)
8390 {
8391 return;
8392 }
8393
8394 mip = MolInfoNew ();
8395 if (mip != NULL) {
8396 mip->biomol = 8;
8397 if (sqfp == NULL) {
8398 /* no technique */
8399 } else if (GetStatus (sqfp->protTechBoth)) {
8400 mip->tech = 10;
8401 } else {
8402 mip->tech = 13;
8403 }
8404 if (sqfp == NULL) {
8405 if (use_this == NULL) {
8406 partialN = FALSE;
8407 partialC = FALSE;
8408 } else {
8409 CheckSeqLocForPartial (use_this, &partialN, &partialC);
8410 }
8411 } else {
8412 partialN = GetStatus (sqfp->partialN);
8413 partialC = GetStatus (sqfp->partialC);
8414 }
8415 if (partialN && partialC) {
8416 mip->completeness = 5;
8417 } else if (partialN) {
8418 mip->completeness = 3;
8419 } else if (partialC) {
8420 mip->completeness = 4;
8421 }
8422 vnp = CreateNewDescriptor (prot_sep, Seq_descr_molinfo);
8423 if (vnp != NULL) {
8424 vnp->data.ptrvalue = (Pointer) mip;
8425 }
8426 }
8427 if (assign_sep != NULL) {
8428 if (sqfp != NULL && sqfp->seqPackage == SEQ_PKG_GENOMICCDNA) {
8429 ClearBatchSuggestNucleotide ();
8430 msep = FindRnaByRefOnProtein (assign_sep, prot_sep);
8431 if (msep == NULL) {
8432 msep = FindRnaByRefOnRna (assign_sep, prot_sep);
8433 }
8434 if (msep == NULL && nucbsp != NULL && IS_Bioseq (prot_sep)) {
8435 protbsp = (BioseqPtr) prot_sep->data.ptrvalue;
8436 if (protbsp != NULL) {
8437 slp = PredictCodingRegion (nucbsp, protbsp, code);
8438 if (slp != NULL) {
8439 mindiff = INT4_MAX;
8440 FindRnaByLocationOverlap (assign_sep, slp, &mindiff, &msep);
8441 }
8442 SeqLocFree (slp);
8443 }
8444 }
8445 }
8446 if (msep != NULL) {
8447 msep = GetBestTopParentForDataEx (ObjMgrGetEntityIDForChoice (msep),
8448 (BioseqPtr) msep->data.ptrvalue, TRUE);
8449 }
8450 if (msep == NULL) {
8451 msep = assign_sep;
8452 if (IS_Bioseq (msep))
8453 {
8454 msep = GetBestTopParentForDataEx (ObjMgrGetEntityIDForChoice (msep),
8455 (BioseqPtr) msep->data.ptrvalue, TRUE);
8456 }
8457 }
8458 AddSeqEntryToSeqEntry (msep, prot_sep, TRUE);
8459 AutomaticProteinProcess (msep, prot_sep, code, makeMRNA, use_this);
8460 } else {
8461 AutomaticProteinProcess (assign_sep, prot_sep, code, makeMRNA, use_this);
8462 }
8463 }
8464
8465 static SeqEntryPtr FindSeqEntryWithTranscriptID (SeqEntryPtr sep, CharPtr transcript_id)
8466 {
8467 SeqEntryPtr found_sep = NULL;
8468 BioseqPtr nbsp;
8469 SeqIdPtr sip, sip_next;
8470 CharPtr tmp;
8471 BioseqSetPtr bssp;
8472
8473 if (IS_Bioseq (sep))
8474 {
8475 nbsp = sep->data.ptrvalue;
8476 for (sip = nbsp->id; sip != NULL && found_sep == NULL; sip = sip_next)
8477 {
8478 sip_next = sip->next;
8479 sip->next = NULL;
8480 tmp = SeqIdWholeLabel (sip, PRINTID_REPORT);
8481 sip->next = sip_next;
8482 if (StringCmp (tmp, transcript_id) == 0)
8483 {
8484 found_sep = sep;
8485 }
8486 tmp = MemFree (tmp);
8487 }
8488 }
8489 else
8490 {
8491 bssp = (BioseqSetPtr) sep->data.ptrvalue;
8492 for (sep = bssp->seq_set; sep != NULL && found_sep == NULL; sep = sep->next)
8493 {
8494 found_sep = FindSeqEntryWithTranscriptID (sep, transcript_id);
8495 }
8496 }
8497 return found_sep;
8498 }
8499
8500 /* This section of code is used for matching up proteins to coding region locations
8501 * on the nucleotide sequences.
8502 */
8503
/* A NucProtAssoc list is used to hold the list of pairings between protein and nucleotide
 * sequences.  There is one list node per protein sequence.  The position field of a node
 * indicates the position of the nucleotide sequence in the set plus one - a zero indicates
 * that there is no nucleotide for this protein.  The loc field is used to hold the
 * location of the coding region on the nucleotide.
 */
8510
8511 /* This function frees the AssociationList. */
8512 extern NucProtAssocPtr FreeAssociationList (NucProtAssocPtr assoc_list)
8513 {
8514 if (assoc_list == NULL)
8515 {
8516 return NULL;
8517 }
8518 assoc_list->next = FreeAssociationList (assoc_list->next);
8519 assoc_list->loc = SeqLocFree (assoc_list->loc);
8520 assoc_list = MemFree (assoc_list);
8521 return assoc_list;
8522 }
8523
8524 static NucProtAssocPtr NewAssociationList (NucProtAssocPtr PNTR assoc_list, Int4 position, SeqLocPtr loc)
8525 {
8526 NucProtAssocPtr last = NULL;
8527 NucProtAssocPtr new_assoc = (NucProtAssocPtr) MemNew (sizeof (NucProtAssocData));
8528
8529 if (assoc_list == NULL) {
8530 return NULL;
8531 }
8532 if (new_assoc != NULL) {
8533 new_assoc->position = position;
8534 new_assoc->loc = loc;
8535 new_assoc->next = NULL;
8536 if (*assoc_list == NULL) {
8537 *assoc_list = new_assoc;
8538 } else {
8539 last = *assoc_list;
8540 while (last->next != NULL) {
8541 last = last->next;
8542 }
8543 last->next = new_assoc;
8544 }
8545 }
8546 return *assoc_list;
8547 }
8548
8549 /* This function copies the AssociationList */
8550 static NucProtAssocPtr CopyAssociationList (NucProtAssocPtr orig_assoc_list)
8551 {
8552 NucProtAssocPtr copy_assoc_list = NULL;
8553
8554 if (orig_assoc_list == NULL)
8555 {
8556 return NULL;
8557 }
8558 copy_assoc_list = (NucProtAssocPtr) MemNew (sizeof (NucProtAssocData));
8559 if (copy_assoc_list != NULL)
8560 {
8561 copy_assoc_list->position = orig_assoc_list->position;
8562 copy_assoc_list->loc = SeqLocCopy (orig_assoc_list->loc);
8563 copy_assoc_list->next = CopyAssociationList (orig_assoc_list->next);
8564 }
8565
8566 return copy_assoc_list;
8567 }
8568
8569
8570 /* This function determines whether all proteins have been assigned to
8571 * nucleotide sequences.
8572 */
8573 static Boolean AllLocationsProvided (NucProtAssocPtr vnp)
8574 {
8575 if (vnp == NULL)
8576 {
8577 return FALSE;
8578 }
8579 while (vnp != NULL)
8580 {
8581 if (vnp->position == 0)
8582 {
8583 return FALSE;
8584 }
8585 vnp = vnp->next;
8586 }
8587 return TRUE;
8588 }
8589
8590 /* This function determines whether any proteins have been assigned to
8591 * nucleotide sequences.
8592 */
8593 static Boolean AnyLocationsProvided (NucProtAssocPtr vnp)
8594 {
8595 if (vnp == NULL)
8596 {
8597 return FALSE;
8598 }
8599 while (vnp != NULL)
8600 {
8601 if (vnp->position != 0)
8602 {
8603 return TRUE;
8604 }
8605 vnp = vnp->next;
8606 }
8607 return FALSE;
8608 }
8609
8610 /* Given a nucleotide-protein pair, this function calculates a coding region location
8611 * using Suggest Intervals. If no location is found, a location that includes the
8612 * entire sequence is returned instead.
8613 */
8614 static SeqLocPtr DefaultPairInterval (BioseqPtr nbsp, BioseqPtr pbsp, Int2 code)
8615 {
8616 SeqLocPtr slp;
8617 ErrSev oldsev;
8618 Char prot_str[3];
8619 Boolean partial5 = FALSE, partial3 = FALSE;
8620
8621 if (nbsp == NULL || pbsp == NULL)
8622 {
8623 return NULL;
8624 }
8625
8626 /* need to suppress errors */
8627 oldsev = ErrSetMessageLevel (SEV_MAX);
8628
8629 /* try to get location using SuggestIntervals */
8630 SetBatchSuggestNucleotide (nbsp, code);
8631 slp = PredictCodingRegion (nbsp, pbsp, code);
8632 ClearBatchSuggestNucleotide ();
8633
8634 ErrSetMessageLevel (oldsev);
8635
8636 /* if no location, use entire sequence */
8637 if (slp == NULL)
8638 {
8639 slp = SeqLocIntNew (0, nbsp->length - 1, Seq_strand_plus, nbsp->id);
8640 }
8641
8642 /* check for start and stop codons */
8643 SeqPortStreamInt (pbsp, 0, 1, Seq_strand_plus, EXPAND_GAPS_TO_DASHES, (Pointer) (prot_str), NULL);
8644 if (prot_str[0] != 'M') {
8645 partial5 = TRUE;
8646 }
8647
8648 if (SeqLocLen (slp) / 3 != pbsp->length + 1) {
8649 partial3 = TRUE;
8650 }
8651
8652 SetSeqLocPartial (slp, partial5, partial3);
8653
8654 return slp;
8655 }
8656
8657
8658 static Boolean FindFeaturesInIdenticalRegions (NucProtAssocPtr assoc_list)
8659 {
8660 Char path [PATH_MAX];
8661 FILE *fp;
8662 NucProtAssocPtr vnp;
8663 SeqFeatPtr sfp;
8664 SeqMgrFeatContext fcontext;
8665 Char id_txt [128];
8666 Boolean found_any = FALSE;
8667
8668 if (assoc_list == NULL)
8669 {
8670 return FALSE;
8671 }
8672
8673 TmpNam (path);
8674 fp = FileOpen (path, "wb");
8675
8676 for (vnp = assoc_list; vnp != NULL; vnp = vnp->next) {
8677 if (vnp->loc != NULL) {
8678 sfp = SeqMgrGetOverlappingCDS (vnp->loc, &fcontext);
8679 if (sfp != NULL && SeqLocCompare (vnp->loc, sfp->location) == SLC_A_EQ_B) {
8680 if (fp == NULL) {
8681 return TRUE;
8682 } else {
8683 found_any = TRUE;
8684 SeqIdWrite (SeqLocId (vnp->loc), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
8685 fprintf (fp, "%s\n", id_txt);
8686 }
8687 }
8688 }
8689 }
8690 FileClose (fp);
8691
8692 if (found_any) {
8693 LaunchGeneralTextViewer (path, "Sequences with pre-existing Coding Regions");
8694 }
8695 FileRemove (path);
8696 return found_any;
8697 }
8698
8699
8700 static Int2 GetGeneticCodeFromBioseq (BioseqPtr bsp)
8701 {
8702 Int2 code = 1;
8703 SeqDescrPtr sdp;
8704 SeqMgrDescContext context;
8705 CharPtr location;
8706
8707 if (bsp != NULL) {
8708 sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &context);
8709 if (sdp != NULL) {
8710 location = FindValueFromPairInDeflineBeforeCharPtr ("location", sdp->data.ptrvalue, NULL);
8711 if (!StringHasNoText (location)) {
8712 code = UseGeneticCodeForLocation (location);
8713 }
8714 }
8715 }
8716 return code;
8717 }
8718
8719
8720 /* This function takes a ValNode list where each ValNode represents
8721 * a protein in prot_list (in order). The choice for each ValNode
8722 * represents the position of the chosen nucleotide in the nuc_list
8723 * (position includes segments in segmented sets, which is why
8724 * FindNthSequenceInSet is used) plus one - zero indicates that there
8725 * is no nucleotide sequence for this protein.
8726 * The data.ptrvalue for the ValNode is to be populated with a
8727 * coding region SeqLoc, or NULL if there is no nucleotide for the protein.
8728 */
8729 static Boolean
8730 PickCodingRegionLocationsForProteinNucleotidePairs
8731 (NucProtAssocPtr assoc_list,
8732 SeqEntryPtr nuc_list,
8733 SeqEntryPtr prot_list)
8734 {
8735 NucProtAssocPtr vnp_assoc;
8736 Int4 data_row;
8737 BioseqPtr nbsp, pbsp;
8738 SeqLocPtr slp;
8739 Char path [PATH_MAX];
8740 FILE *fp;
8741 Boolean errors_found = FALSE;
8742 Char n_idstr[128];
8743 Char p_idstr[128];
8744 Int2 code;
8745
8746 if (assoc_list == NULL || nuc_list == NULL || prot_list == NULL)
8747 {
8748 return FALSE;
8749 }
8750
8751 TmpNam (path);
8752 fp = FileOpen (path, "wb");
8753
8754 vnp_assoc = assoc_list;
8755 for (data_row = 0, vnp_assoc = assoc_list;
8756 vnp_assoc != NULL;
8757 data_row++, vnp_assoc = vnp_assoc->next)
8758 {
8759 if (vnp_assoc->position > 0)
8760 {
8761 nbsp = FindNthSequenceInSet (nuc_list, vnp_assoc->position - 1, NULL, TRUE);
8762 pbsp = FindNthSequenceInSet (prot_list, data_row, NULL, FALSE);
8763 if (nbsp == NULL || pbsp == NULL) {
8764 slp = NULL;
8765 } else if ((nbsp->length +1) / 3 < pbsp->length) {
8766 if (fp != NULL) {
8767 SeqIdWrite (SeqIdFindWorst (nbsp->id), n_idstr, PRINTID_REPORT,
8768 sizeof (n_idstr));
8769 SeqIdWrite (SeqIdFindWorst (pbsp->id), p_idstr, PRINTID_REPORT,
8770 sizeof (p_idstr));
8771 fprintf (fp, "%s is too short to encode %s\n", n_idstr, p_idstr);
8772 }
8773 vnp_assoc->position = 0;
8774 errors_found = TRUE;
8775 slp = NULL;
8776 } else {
8777 code = GetGeneticCodeFromBioseq (nbsp);
8778 slp = DefaultPairInterval (nbsp, pbsp, code);
8779 }
8780 }
8781 else
8782 {
8783 slp = NULL;
8784 }
8785 vnp_assoc->loc = SeqLocFree (vnp_assoc->loc);
8786 vnp_assoc->loc = slp;
8787 }
8788
8789 FileClose (fp);
8790 if (errors_found) {
8791 LaunchGeneralTextViewer (path, "Nucleotide-Protein Mismatches");
8792 }
8793 FileRemove (path);
8794 return !errors_found;
8795 }
8796
8797
8798 static Int2 FindGeneticCodeForBioseq (BioseqPtr bsp, Int2 default_code)
8799 {
8800 Int2 code = default_code;
8801 BioSourcePtr biop;
8802 SeqEntryPtr nsep;
8803 BioseqSetPtr bssp;
8804 SeqDescrPtr sdp = NULL;
8805
8806 if (bsp == NULL) return default_code;
8807 nsep = GetBestTopParentForData (ObjMgrGetEntityIDForPointer (bsp), bsp);
8808 if (nsep == NULL || nsep->data.ptrvalue == NULL) return default_code;
8809 if (nsep->choice == 1)
8810 {
8811 bsp = nsep->data.ptrvalue;
8812 sdp = bsp->descr;
8813 }
8814 else if (nsep->choice == 2)
8815 {
8816 bssp = nsep->data.ptrvalue;
8817 sdp = bssp->descr;
8818 }
8819 while (sdp != NULL)
8820 {
8821 if (sdp->choice == Seq_descr_source && sdp->data.ptrvalue != NULL)
8822 {
8823 biop = (BioSourcePtr) sdp->data.ptrvalue;
8824 if (biop->org != NULL && biop->org->orgname != NULL)
8825 {
8826 code = BioSourceToGeneticCode (biop);
8827 }
8828 }
8829 sdp = sdp->next;
8830 }
8831 return code;
8832 }
8833
8834
8835 /* This function takes a ValNode list of coding region SeqLocs,
8836 * the list of nucleotide sequences, and the list of protein sequences
8837 * and creates the nuc-prot sets.
8838 */
8839 static void
8840 AssignProteinsToSelectedNucleotides
8841 (NucProtAssocPtr assoc_list,
8842 SeqEntryPtr nuc_list,
8843 SeqEntryPtr prot_list,
8844 SequencesFormPtr sqfp,
8845 Int2 code,
8846 Boolean makeMRNA)
8847 {
8848 SeqEntryPtr prot_sep, nsep, prot_next;
8849 NucProtAssocPtr vnp_assoc;
8850 BioseqPtr nbsp;
8851 BioseqPtr PNTR bsp_array;
8852 Int4 prot_num;
8853 ValNodePtr descr = NULL;
8854 Int2 genCode;
8855
8856 if (assoc_list == NULL || nuc_list == NULL || prot_list == NULL)
8857 {
8858 return;
8859 }
8860
8861 /* need to collect bioseqs before we start adding, otherwise the position in
8862 * the set changes */
8863
8864 bsp_array = (BioseqPtr PNTR) MemNew (ValNodeLen (prot_list) * sizeof (BioseqPtr));
8865 if (bsp_array == NULL)
8866 {
8867 return;
8868 }
8869
8870 for (prot_num = 0, vnp_assoc = assoc_list;
8871 vnp_assoc != NULL;
8872 prot_num++, vnp_assoc = vnp_assoc->next)
8873 {
8874 if (vnp_assoc->loc == NULL)
8875 {
8876 bsp_array [prot_num] = NULL;
8877 }
8878 else
8879 {
8880 bsp_array [prot_num] = FindNthSequenceInSet (nuc_list, vnp_assoc->position - 1, NULL, TRUE);
8881 }
8882 }
8883
8884 for (prot_sep = prot_list, vnp_assoc = assoc_list, prot_num = 0;
8885 prot_sep != NULL && vnp_assoc != NULL;
8886 prot_sep = prot_next, vnp_assoc = vnp_assoc->next, prot_num++)
8887 {
8888 prot_next = prot_sep->next;
8889 prot_sep->next = NULL;
8890
8891 if (vnp_assoc->loc == NULL)
8892 {
8893 /* discard protein */
8894 if (IS_Bioseq (prot_sep))
8895 {
8896 SeqMgrDeleteFromBioseqIndex (prot_sep->data.ptrvalue);
8897 }
8898 prot_sep = SeqEntryFree (prot_sep);
8899 }
8900 else
8901 {
8902 nbsp = bsp_array [prot_num];
8903 nsep = SeqMgrGetSeqEntryForData (nbsp);
8904 if (nbsp != NULL && nbsp->repr == Seq_repr_seg)
8905 {
8906 nsep = GetBestTopParentForData (ObjMgrGetEntityIDForPointer (nbsp), nbsp);
8907 }
8908 genCode = FindGeneticCodeForBioseq (nbsp, code);
8909 if (nsep != NULL && nsep->data.ptrvalue == nbsp) {
8910 descr = ExtractBioSourceAndPubs (nsep);
8911 }
8912 AssignOneProtein (prot_sep, sqfp, nsep, vnp_assoc->loc, nbsp,
8913 genCode, makeMRNA);
8914 if (descr != NULL) {
8915 ReplaceBioSourceAndPubs (nsep, descr);
8916 }
8917 vnp_assoc->loc = NULL; /*SeqLoc was freed in AssignOneProtein */
8918 }
8919 }
8920
8921 bsp_array = MemFree (bsp_array);
8922 }
8923
8924 /* This function creates a new protein ID based on the nucleotide ID that will be
8925 * unique within the record - nucleotide and protein sequence IDs are checked
8926 * for matches.
8927 */
8928 static CharPtr
8929 BuildProteinIDUniqueInIDAndTitleEdit
8930 (CharPtr nuc_id,
8931 IDAndTitleEditPtr iatep_nuc,
8932 IDAndTitleEditPtr iatep_prot)
8933 {
8934 CharPtr new_id, cp;
8935 Int4 offset, seq_num;
8936 Boolean unique_found = FALSE;
8937
8938 if (iatep_nuc == NULL || iatep_prot == NULL || StringHasNoText (nuc_id))
8939 {
8940 return NULL;
8941 }
8942
8943 new_id = (CharPtr) MemNew ((StringLen (nuc_id) + 20) * sizeof (Char));
8944 if (new_id != NULL)
8945 {
8946 StringCpy (new_id, nuc_id);
8947 StringCat (new_id, "_");
8948 cp = new_id + StringLen (new_id);
8949 for (offset = 1; offset < INT4_MAX && ! unique_found; offset ++)
8950 {
8951 sprintf (cp, "%d", offset);
8952 unique_found = TRUE;
8953 for (seq_num = 0; seq_num < iatep_nuc->num_sequences && unique_found; seq_num++)
8954 {
8955 if (StringCmp (iatep_nuc->id_list [seq_num], new_id) == 0)
8956 {
8957 unique_found = FALSE;
8958 }
8959 }
8960 for (seq_num = 0; seq_num < iatep_prot->num_sequences && unique_found; seq_num++)
8961 {
8962 if (StringCmp (iatep_prot->id_list [seq_num], new_id) == 0)
8963 {
8964 unique_found = FALSE;
8965 }
8966 }
8967 }
8968 }
8969 if (unique_found)
8970 {
8971 return new_id;
8972 }
8973 else
8974 {
8975 new_id = MemFree (new_id);
8976 return StringSave ("too_many");
8977 }
8978 }
8979
8980 static Boolean DoIdsMatch (CharPtr id1, CharPtr id2)
8981 {
8982 CharPtr tmp1, cp1 = NULL;
8983 CharPtr tmp2, cp2 = NULL;
8984 Boolean match = FALSE;
8985
8986 tmp1 = StringChr (id1, '|');
8987 if (tmp1 == NULL) {
8988 tmp1 = id1;
8989 } else if (tmp1 == id1 + 2) {
8990 tmp1++;
8991 cp1 = StringChr (tmp1, '|');
8992 if (cp1 != NULL) {
8993 *cp1 = 0;
8994 }
8995 }
8996
8997 tmp2 = StringChr (id2, '|');
8998 if (tmp2 == NULL) {
8999 tmp2 = id2;
9000 } else if (tmp2 == id2 + 2) {
9001 tmp2++;
9002 cp2 = StringChr (tmp2, '|');
9003 if (cp2 != NULL) {
9004 *cp2 = 0;
9005 }
9006 }
9007
9008 if (StringCmp (tmp1, tmp2) == 0)
9009 {
9010 match = TRUE;
9011 }
9012 if (cp1 != NULL) {
9013 *cp1 = '|';
9014 }
9015 if (cp2 != NULL) {
9016 *cp2 = '|';
9017 }
9018 return match;
9019 }
9020
9021
9022 /* if the user gave the protein sequences the same IDs as the nucleotide sequences,
9023 * we need to create new sequence IDs for the proteins so that they will be unique.
9024 * We should also make sure that sequence IDs that don't match nucleotide sequence
9025 * IDs are unique.
9026 */
9027 static void ReplaceDuplicateProteinIDs (SeqEntryPtr nuc_list, SeqEntryPtr prot_list)
9028 {
9029 Int4 nuc_seq_num, prot_seq_num, prot_seq_num_check;
9030 IDAndTitleEditPtr iatep_nuc, iatep_prot;
9031 Boolean found_nuc_match;
9032 CharPtr tmp_str, cp;
9033 BioseqPtr prot_bsp, nuc_bsp;
9034
9035 if (nuc_list == NULL || prot_list == NULL)
9036 {
9037 return;
9038 }
9039
9040 iatep_nuc = SeqEntryListToIDAndTitleEditEx (nuc_list, TRUE);
9041 iatep_prot = SeqEntryListToIDAndTitleEdit (prot_list);
9042 if (iatep_nuc != NULL && iatep_prot != NULL)
9043 {
9044 for (prot_seq_num = 0; prot_seq_num < iatep_prot->num_sequences; prot_seq_num++)
9045 {
9046 /* This part replaces any protein sequence IDs that match a nucleotide ID with
9047 * the nucleotide ID plus an underscore plus a number that makes the ID
9048 * unique.
9049 */
9050 found_nuc_match = FALSE;
9051 prot_bsp = FindNthSequenceInSet (prot_list, prot_seq_num, &(iatep_prot->is_seg[prot_seq_num]), FALSE);
9052 if (prot_bsp == NULL) continue;
9053 for (nuc_seq_num = 0;
9054 nuc_seq_num < iatep_nuc->num_sequences && ! found_nuc_match;
9055 nuc_seq_num++)
9056 {
9057 nuc_bsp = FindNthSequenceInSet (nuc_list, nuc_seq_num, &(iatep_nuc->is_seg[prot_seq_num]), TRUE);
9058 if (nuc_bsp == NULL) continue;
9059
9060 if (SeqIdIn (prot_bsp->id, nuc_bsp->id) || RelaxedSeqIdIn (prot_bsp->id, nuc_bsp->id)
9061 || DoIdsMatch (iatep_nuc->id_list [nuc_seq_num],
9062 iatep_prot->id_list [prot_seq_num])) {
9063 tmp_str = iatep_nuc->id_list [nuc_seq_num];
9064 cp = StringChr (tmp_str, '|');
9065 if (cp == tmp_str + 2) {
9066 tmp_str += 3;
9067 cp = StringChr (tmp_str, '|');
9068 }
9069 if (cp != NULL) {
9070 *cp = 0;
9071 }
9072
9073 iatep_prot->id_list [prot_seq_num] = MemFree (iatep_prot->id_list [prot_seq_num]);
9074 iatep_prot->id_list [prot_seq_num] = BuildProteinIDUniqueInIDAndTitleEdit (tmp_str,
9075 iatep_nuc,
9076 iatep_prot);
9077 if (cp != NULL) {
9078 *cp = '|';
9079 }
9080 found_nuc_match = TRUE;
9081 }
9082 }
9083 /* This part replaces a protein sequence ID that matches a previous protein
9084 * sequence ID with the original protein sequence ID plus an underscore plus
9085 * a number that makes the ID unique.
9086 */
9087 if (!found_nuc_match)
9088 {
9089 for (prot_seq_num_check = prot_seq_num + 1;
9090 prot_seq_num_check < iatep_prot->num_sequences;
9091 prot_seq_num_check ++)
9092 {
9093 if (StringCmp (iatep_prot->id_list [prot_seq_num],
9094 iatep_prot->id_list [prot_seq_num_check]) == 0)
9095 {
9096 tmp_str = iatep_prot->id_list [prot_seq_num_check];
9097 cp = StringChr (tmp_str, '|');
9098 if (cp == tmp_str + 2) {
9099 tmp_str += 3;
9100 cp = StringChr (tmp_str, '|');
9101 }
9102 if (cp != NULL) {
9103 *cp = 0;
9104 }
9105 tmp_str = StringSave (tmp_str);
9106
9107 iatep_prot->id_list [prot_seq_num_check] = MemFree (iatep_prot->id_list [prot_seq_num_check]);
9108 iatep_prot->id_list [prot_seq_num_check] = BuildProteinIDUniqueInIDAndTitleEdit (tmp_str,
9109 iatep_nuc,
9110 iatep_prot);
9111 tmp_str = MemFree (tmp_str);
9112 }
9113 }
9114 }
9115 }
9116 }
9117 ApplyIDAndTitleEditToSeqEntryList (prot_list, iatep_prot);
9118 iatep_prot = IDAndTitleEditFree (iatep_prot);
9119 iatep_nuc = IDAndTitleEditFree (iatep_nuc);
9120 }
9121
9122 static Uint2 nucprotedit_types [] = {
9123 TAGLIST_PROMPT, TAGLIST_PROMPT, TAGLIST_POPUP, TAGLIST_TEXT, TAGLIST_TEXT
9124 };
9125
9126 static Uint2 nucprotedit_widths [] = {
9127 5, 20, 10, 15, 15
9128 };
9129
9130 #define NUCPROTEDIT_NUCID_COLUMN 2
9131 #define NUCPROTEDIT_GENE_COLUMN 3
9132 #define NUCPROTEDIT_PROT_COLUMN 4
9133
9134 typedef struct nucprotedit
9135 {
9136 SeqEntryPtr nuc_list;
9137 SeqEntryPtr prot_list;
9138 DialoG dlg;
9139 ButtoN accept_btn;
9140 NucProtAssocPtr assoc_list;
9141 TexT all_gene_txt;
9142 TexT all_prot_txt;
9143 } NucProtEditData, PNTR NucProtEditPtr;
9144
9145 static void PopulateNucProtEdit (NucProtEditPtr npep)
9146 {
9147 IDAndTitleEditPtr iatep_nuc, iatep_prot;
9148 ValNodePtr row_list = NULL;
9149 NucProtAssocPtr vnp_assoc;
9150 TagListPtr tlp;
9151 CharPtr data_string, gene_locus, prot_name;
9152 Int4 data_len;
9153 Int4 prot_num;
9154 Int4 old_scroll_pos = 0;
9155
9156 if (npep == NULL)
9157 {
9158 return;
9159 }
9160
9161 tlp = (TagListPtr) GetObjectExtra (npep->dlg);
9162 if (tlp == NULL)
9163 {
9164 return;
9165 }
9166
9167 /* need to get bar value and reset after populating */
9168 if (tlp->bar != NULL)
9169 {
9170 old_scroll_pos = GetBarValue (tlp->bar);
9171 }
9172
9173 iatep_nuc = SeqEntryListToIDAndTitleEditEx (npep->nuc_list, TRUE);
9174 iatep_prot = SeqEntryListToIDAndTitleEdit (npep->prot_list);
9175 if (iatep_nuc != NULL && iatep_prot != NULL)
9176 {
9177 vnp_assoc = npep->assoc_list;
9178 for (prot_num = 0; prot_num < iatep_prot->num_sequences; prot_num++)
9179 {
9180 /* first column is protein ID */
9181 /* second column is choice for nucleotide ID */
9182 /* third column is gene locus tag */
9183 /* fourth column is protein name */
9184 /* fifth column indicates presence of suggested interval */
9185 gene_locus = FindValueFromPairInDefline ("gene", iatep_prot->title_list [prot_num]);
9186 prot_name = FindValueFromPairInDefline ("protein", iatep_prot->title_list [prot_num]);
9187
9188 data_len = StringLen (iatep_prot->id_list [prot_num])
9189 + 20
9190 + StringLen (gene_locus)
9191 + StringLen (prot_name);
9192 data_string = (CharPtr) MemNew (data_len * sizeof (Char));
9193 if (data_string != NULL)
9194 {
9195 sprintf (data_string, "%d\t%s\t%d\t%s\t%s\n",
9196 prot_num + 1,
9197 iatep_prot->id_list [prot_num],
9198 vnp_assoc == NULL ? 0 : vnp_assoc->position,
9199 gene_locus == NULL ? "" : gene_locus,
9200 prot_name == NULL ? "" : prot_name);
9201 ValNodeAddPointer (&row_list, 0, data_string);
9202 }
9203 gene_locus = MemFree (gene_locus);
9204 prot_name = MemFree (prot_name);
9205 if (vnp_assoc != NULL)
9206 {
9207 vnp_assoc = vnp_assoc->next;
9208 }
9209 }
9210 SendMessageToDialog (npep->dlg, VIB_MSG_RESET);
9211 tlp->vnp = row_list;
9212
9213 if (iatep_prot->num_sequences > tlp->rows)
9214 {
9215 tlp->max = MAX ((Int2) 0, (Int2) (iatep_prot->num_sequences - tlp->rows));
9216 CorrectBarMax (tlp->bar, tlp->max);
9217 CorrectBarPage (tlp->bar, tlp->rows - 1, tlp->rows - 1);
9218 Enable (tlp->bar);
9219 SetBarValue (tlp->bar, old_scroll_pos);
9220 }
9221 else
9222 {
9223 Hide (tlp->bar);
9224 }
9225 SendMessageToDialog (npep->dlg, VIB_MSG_REDRAW);
9226 }
9227
9228 iatep_nuc = IDAndTitleEditFree (iatep_nuc);
9229 iatep_prot = IDAndTitleEditFree (iatep_prot);
9230 }
9231
9232 static CharPtr
9233 GetTagListValueEx (TagListPtr tlp, Int4 seq_num, Int4 col_num);
9234
9235 static void ApplyGeneNameToAllSequences (ButtoN b)
9236 {
9237 NucProtEditPtr npep;
9238 CharPtr all_gene_name, new_val;
9239 TagListPtr tlp;
9240 Int4 seq_num;
9241 ValNodePtr vnp;
9242
9243 npep = (NucProtEditPtr) GetObjectExtra (b);
9244 if (npep == NULL)
9245 {
9246 return;
9247 }
9248
9249 tlp = (TagListPtr) GetObjectExtra (npep->dlg);
9250 if (tlp == NULL)
9251 {
9252 return;
9253 }
9254 all_gene_name = SaveStringFromText (npep->all_gene_txt);
9255 if (ANS_YES == Message (MSG_YN, "Are you sure you want to set all of the gene locus values to %s?",
9256 all_gene_name))
9257 {
9258 for (vnp = tlp->vnp, seq_num = 0;
9259 vnp != NULL;
9260 vnp = vnp->next, seq_num++)
9261 {
9262 new_val = ReplaceTagListColumn (vnp->data.ptrvalue, all_gene_name, NUCPROTEDIT_GENE_COLUMN);
9263 vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
9264 vnp->data.ptrvalue = new_val;
9265 }
9266 SendMessageToDialog (npep->dlg, VIB_MSG_REDRAW);
9267 }
9268 all_gene_name = MemFree (all_gene_name);
9269 }
9270
9271 static void ApplyProteinNameToAllSequences (ButtoN b)
9272 {
9273 NucProtEditPtr npep;
9274 CharPtr all_prot_name, new_val;
9275 TagListPtr tlp;
9276 Int4 seq_num;
9277 ValNodePtr vnp;
9278
9279 npep = (NucProtEditPtr) GetObjectExtra (b);
9280 if (npep == NULL)
9281 {
9282 return;
9283 }
9284
9285 tlp = (TagListPtr) GetObjectExtra (npep->dlg);
9286 if (tlp == NULL)
9287 {
9288 return;
9289 }
9290 all_prot_name = SaveStringFromText (npep->all_prot_txt);
9291 if (ANS_YES == Message (MSG_YN, "Are you sure you want to set all of the protein names to %s?",
9292 all_prot_name))
9293 {
9294 for (vnp = tlp->vnp, seq_num = 0;
9295 vnp != NULL;
9296 vnp = vnp->next, seq_num++)
9297 {
9298 new_val = ReplaceTagListColumn (vnp->data.ptrvalue, all_prot_name, NUCPROTEDIT_PROT_COLUMN);
9299 vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
9300 vnp->data.ptrvalue = new_val;
9301 }
9302 SendMessageToDialog (npep->dlg, VIB_MSG_REDRAW);
9303 }
9304 all_prot_name = MemFree (all_prot_name);
9305 }
9306
9307 static void ApplyNucProtEditGeneAndProt (NucProtEditPtr npep)
9308 {