|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/demo/cleanasn.c |
source navigation diff markup identifier search freetext search file search |
1 /* cleanasn.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: cleanasn.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 10/19/99
31 *
32 * $Revision: 6.106 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objfdef.h>
49 #include <objsub.h>
50 #include <sequtil.h>
51 #include <gather.h>
52 #include <sqnutils.h>
53 #include <explore.h>
54 #include <tofasta.h>
55 #include <toasn3.h>
56 #include <toporg.h>
57 #include <subutil.h>
58 #include <asn2gnbk.h>
59 #include <pmfapi.h>
60 #include <tax3api.h>
61 #include <asn2gnbi.h>
62 #include <ent2api.h>
63 #ifdef INTERNAL_NCBI_CLEANASN
64 #include <accpubseq.h>
65 #endif
66 #define NLM_GENERATED_CODE_PROTO
67 #include <objmacro.h>
68 #include <macroapi.h>
69
70 #define CLEANASN_APP_VER "4.2"
71
72 CharPtr CLEANASN_APPLICATION = CLEANASN_APP_VER;
73
74 typedef struct sums {
75 Int4 nucs;
76 Int4 prts;
77 Int4 recs;
78 } SumData, PNTR SumDataPtr;
79
80 typedef struct dbsums {
81 SumData genbank;
82 SumData embl;
83 SumData ddbj;
84 SumData refseq;
85 SumData other;
86 } DbSumData, PNTR DbSumPtr;
87
88 typedef struct counts {
89 Int4 auth;
90 Int4 bsec;
91 Int4 clnr;
92 Int4 gbbk;
93 Int4 modr;
94 Int4 move;
95 Int4 norm;
96 Int4 nucs;
97 Int4 okay;
98 Int4 othr;
99 Int4 pack;
100 Int4 prts;
101 Int4 publ;
102 Int4 recs;
103 Int4 sloc;
104 Int4 sort;
105 Int4 ssec;
106 Int4 titl;
107 } CountData, PNTR CountDataPtr;
108
109 typedef struct cleanflags {
110 Char buf [64];
111 Int4 gi;
112 Int2 year;
113 Boolean stripSerial;
114 Boolean isRefSeq;
115 Boolean batch;
116 Boolean binary;
117 Boolean compressed;
118 Int2 type;
119 CharPtr results;
120 CharPtr outfile;
121 CharPtr firstfile;
122 CharPtr lastfile;
123 Boolean foundfirst;
124 Boolean foundlast;
125 CharPtr sourcedb;
126 CharPtr report;
127 CharPtr selective;
128 ModType ffmode;
129 CharPtr ffdiff;
130 CharPtr asn2flat;
131 CharPtr asnval;
132 CharPtr clean;
133 CharPtr modernize;
134 CharPtr link;
135 CharPtr feat;
136 CharPtr desc;
137 CharPtr mods;
138 ValNodePtr action_list;
139 Boolean taxon;
140 Boolean pub;
141 Int4 unpubcount;
142 CountData rawcounts;
143 CountData cumcounts;
144 DbSumData dbsums;
145 AsnModulePtr amp;
146 AsnTypePtr atp_bss;
147 AsnTypePtr atp_bsss;
148 AsnTypePtr atp_se;
149 AsnTypePtr atp_bsc;
150 AsnTypePtr bssp_atp;
151 BioseqSet bss;
152 FILE *logfp;
153 } CleanFlagData, PNTR CleanFlagPtr;
154
155 static void RemoveFeatUser (
156 SeqFeatPtr sfp,
157 Pointer userdata
158 )
159
160 {
161 if (sfp == NULL) return;
162 if (sfp->ext != NULL) {
163 sfp->ext = UserObjectFree (sfp->ext);
164 }
165 }
166
167 static void RemoveFeatDbxref (
168 SeqFeatPtr sfp,
169 Pointer userdata
170 )
171
172 {
173 DbtagPtr dbt;
174 ValNodePtr next, vnp;
175
176 if (sfp == NULL) return;
177 for (vnp = sfp->dbxref; vnp != NULL; vnp = next) {
178 next = vnp->next;
179 dbt = (DbtagPtr) vnp->data.ptrvalue;
180 DbtagFree (dbt);
181 MemFree (vnp);
182 }
183 sfp->dbxref = NULL;
184 }
185
186 typedef struct dummysmfedata {
187 Int4 max;
188 Int4 num_at_max;
189 } DummySmfeData, PNTR DummySmfePtr;
190
191 static Boolean LIBCALLBACK CADummySMFEProc (
192 SeqFeatPtr sfp,
193 SeqMgrFeatContextPtr context
194 )
195
196
197 {
198 DummySmfePtr dsp;
199 Int4 len;
200
201 if (sfp == NULL || context == NULL) return TRUE;
202 dsp = context->userdata;
203 if (dsp == NULL) return TRUE;
204
205 len = SeqLocLen (sfp->location);
206 if (len < dsp->max) {
207 dsp->max = len;
208 dsp->num_at_max = 1;
209 } else if (len == dsp->max) {
210 (dsp->num_at_max)++;
211 }
212
213 return TRUE;
214 }
215
216 static void RemoveUnnecGeneXref (
217 SeqFeatPtr sfp,
218 Pointer userdata
219 )
220
221 {
222 Int2 count;
223 SeqFeatXrefPtr curr, next;
224 DummySmfeData dsd;
225 SeqMgrFeatContext fcontext;
226 SeqFeatXrefPtr PNTR last;
227 GeneRefPtr grp, grpx;
228 SeqFeatPtr sfpx;
229 CharPtr syn1, syn2;
230
231 if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return;
232 grp = SeqMgrGetGeneXref (sfp);
233 if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return;
234 sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
235 if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return;
236 grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
237 if (grpx == NULL) return;
238
239 if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) {
240 if (StringICmp (grp->locus_tag, grpx->locus_tag) != 0) return;
241 } else if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) {
242 if (StringICmp (grp->locus, grpx->locus) != 0) return;
243 } else if (grp->syn != NULL && grpx->syn != NULL) {
244 syn1 = (CharPtr) grp->syn->data.ptrvalue;
245 syn2 = (CharPtr) grpx->syn->data.ptrvalue;
246 if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) {
247 if (StringICmp (syn1, syn2) != 0) return;
248 }
249 }
250
251 MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
252 dsd.max = INT4_MAX;
253 dsd.num_at_max = 0;
254 count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE,
255 NULL, 0, LOCATION_SUBSET,
256 (Pointer) &dsd, CADummySMFEProc);
257
258 if (dsd.num_at_max < 2) {
259 last = (SeqFeatXrefPtr PNTR) &(sfp->xref);
260 curr = sfp->xref;
261 while (curr != NULL) {
262 next = curr->next;
263 if (curr->data.choice == SEQFEAT_GENE) {
264 *last = next;
265 curr->next = NULL;
266 SeqFeatXrefFree (curr);
267 } else {
268 last = &(curr->next);
269 }
270 curr = next;
271 }
272 }
273 }
274
275 static void MarkTitles (
276 SeqDescrPtr sdp,
277 Pointer userdata
278 )
279
280 {
281 ObjValNodePtr ovn;
282
283 if (sdp == NULL || sdp->choice != Seq_descr_title) return;
284 if (sdp->extended == 0) return;
285 ovn = (ObjValNodePtr) sdp;
286 ovn->idx.deleteme = TRUE;
287 }
288
289 static void DoAutoDef (
290 SeqEntryPtr sep,
291 Uint2 entityID
292 )
293
294 {
295 ValNodePtr defline_clauses = NULL;
296 DeflineFeatureRequestList feature_requests;
297 Int4 index;
298 ValNodePtr modifier_indices = NULL;
299 ModifierItemLocalPtr modList;
300 OrganismDescriptionModifiers odmp;
301 SeqEntryPtr oldscope;
302
303 if (sep == NULL) return;
304 if (entityID < 1) return;
305
306 modList = MemNew (NumDefLineModifiers () * sizeof (ModifierItemLocalData));
307 if (modList == NULL) return;
308
309 InitFeatureRequests (&feature_requests);
310
311 SetRequiredModifiers (modList);
312 CountModifiers (modList, sep);
313
314 InitOrganismDescriptionModifiers (&odmp, sep);
315
316 RemoveNucProtSetTitles (sep);
317 oldscope = SeqEntrySetScope (sep);
318
319 BuildDefLineFeatClauseList (sep, entityID, &feature_requests,
320 DEFAULT_ORGANELLE_CLAUSE, FALSE, FALSE,
321 &defline_clauses);
322 if (AreFeatureClausesUnique (defline_clauses)) {
323 modifier_indices = GetModifierIndicesFromModList (modList);
324 } else {
325 modifier_indices = FindBestModifiers (sep, modList);
326 }
327
328 BuildDefinitionLinesFromFeatureClauseLists (defline_clauses, modList,
329 modifier_indices, &odmp);
330 DefLineFeatClauseListFree (defline_clauses);
331 if (modList != NULL) {
332 for (index = 0; index < NumDefLineModifiers (); index++) {
333 ValNodeFree (modList [index].values_seen);
334 }
335 MemFree (modList);
336 }
337 modifier_indices = ValNodeFree (modifier_indices);
338
339 ClearProteinTitlesInNucProts (entityID, NULL);
340 InstantiateProteinTitles (entityID, NULL);
341
342 SeqEntrySetScope (oldscope);
343 }
344
345 static void LookupPubdesc (
346 PubdescPtr pdp,
347 Pointer userdata
348 )
349
350 {
351 CitArtPtr cap;
352 MedlineEntryPtr mep;
353 PubmedEntryPtr pep;
354 Int4 pmid = 0;
355 ValNodePtr vnp;
356
357 if (pdp == NULL) return;
358
359 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
360 switch (vnp->choice) {
361 case PUB_Muid :
362 /* ignore obsolete muids */
363 break;
364 case PUB_PMid :
365 pmid = vnp->data.intvalue;
366 break;
367 default :
368 /* return on real pub */
369 return;
370 break;
371 }
372 }
373
374 if (pmid == 0) return;
375
376 pep = GetPubMedForUid (pmid);
377 if (pep == NULL) return;
378 mep = (MedlineEntryPtr) pep->medent;
379 if (mep != NULL && mep->cit != NULL) {
380 cap = AsnIoMemCopy ((Pointer) mep->cit,
381 (AsnReadFunc) CitArtAsnRead,
382 (AsnWriteFunc) CitArtAsnWrite);
383 ValNodeAddPointer (&(pdp->pub), PUB_Article, (Pointer) cap);
384 }
385
386 PubmedEntryFree (pep);
387 }
388
389 static void CleanupLocation (
390 SeqFeatPtr sfp,
391 Pointer userdata
392 )
393
394 {
395 BioseqPtr bsp;
396 SeqIntPtr sintp;
397 SeqLocPtr slp;
398
399 if (sfp == NULL || sfp->location == NULL) return;
400
401 CleanUpSeqLoc (sfp->location);
402
403 if (sfp->data.choice == SEQFEAT_REGION ||
404 sfp->data.choice == SEQFEAT_SITE ||
405 sfp->data.choice == SEQFEAT_BOND ||
406 sfp->data.choice == SEQFEAT_PROT) {
407 bsp = BioseqFind (SeqLocId (sfp->location));
408 if (bsp != NULL && ISA_aa (bsp->mol)) {
409 slp = SeqLocFindNext (sfp->location, NULL);
410 while (slp != NULL) {
411 if (slp->choice == SEQLOC_INT) {
412 sintp = (SeqIntPtr) slp->data.ptrvalue;
413 if (sintp != NULL) {
414 if (sintp->strand != Seq_strand_unknown) {
415 sintp->strand = Seq_strand_unknown;
416 }
417 }
418 }
419 slp = SeqLocFindNext (sfp->location, slp);
420 }
421 }
422 }
423 }
424
425 static void CleanupMostRNAs (
426 SeqFeatPtr sfp,
427 Pointer userdata
428 )
429
430 {
431 RnaRefPtr rrp;
432
433 if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
434 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
435 if (rrp == NULL || rrp->type == 255) return;
436
437 CleanUpSeqFeat (sfp, FALSE, FALSE, TRUE, FALSE, NULL);
438 }
439
440 static void CleanupRemainingRNAs (
441 SeqFeatPtr sfp,
442 Pointer userdata
443 )
444
445 {
446 if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
447
448 CleanUpSeqFeat (sfp, FALSE, FALSE, TRUE, FALSE, NULL);
449 }
450
451 static void CleanupPubAuthors (
452 PubdescPtr pdp,
453 Pointer userdata
454 )
455
456 {
457 if (pdp == NULL) return;
458
459 CleanUpPubdescAuthors (pdp);
460 }
461
462 static void CleanupPubBody (
463 PubdescPtr pdp,
464 Pointer userdata
465 )
466
467 {
468 CleanFlagPtr cfp;
469
470 if (pdp == NULL) return;
471 cfp = (CleanFlagPtr) userdata;
472 if (cfp == NULL) return;
473
474 CleanUpPubdescBody (pdp, cfp->stripSerial);
475 }
476
477 static void ModGenes (
478 SeqFeatPtr sfp,
479 Pointer userdata
480 )
481
482 {
483 ModernizeGeneFields (sfp);
484 }
485
486 static void ModRNAs (
487 SeqFeatPtr sfp,
488 Pointer userdata
489 )
490
491 {
492 ModernizeRNAFields (sfp);
493 }
494
495 static void ModPCRs (
496 BioSourcePtr biop,
497 Pointer userdata
498 )
499
500 {
501 ModernizePCRPrimers (biop);
502 }
503
504 static ByteStorePtr Se2Bs (
505 SeqEntryPtr sep
506 )
507
508 {
509 AsnIoBSPtr aibp;
510 ByteStorePtr bs;
511
512 if (sep == NULL) return NULL;
513
514 bs = BSNew (1000);
515 if (bs == NULL) return NULL;
516 aibp = AsnIoBSOpen ("w", bs);
517 if (aibp == NULL || aibp->aip == NULL) return NULL;
518
519 SeqEntryAsnWrite (sep, aibp->aip, NULL);
520
521 AsnIoFlush (aibp->aip);
522 AsnIoBSClose (aibp);
523
524 return bs;
525 }
526
527 static ByteStorePtr Se2BsX (
528 SeqEntryPtr sep
529 )
530
531 {
532 AsnIoBSPtr aibp;
533 ByteStorePtr bs;
534
535 if (sep == NULL) return NULL;
536
537 bs = BSNew (1000);
538 if (bs == NULL) return NULL;
539 aibp = AsnIoBSOpen ("w", bs);
540 if (aibp == NULL || aibp->aip == NULL) return NULL;
541
542 aibp->aip->asn_no_newline = TRUE;
543 aibp->aip->asn_alt_struct = TRUE;
544
545 SeqEntryAsnWrite (sep, aibp->aip, NULL);
546
547 AsnIoFlush (aibp->aip);
548 AsnIoBSClose (aibp);
549
550 return bs;
551 }
552
553 /*
554 static CharPtr Se2Str (
555 SeqEntryPtr sep
556 )
557
558 {
559 AsnIoBSPtr aibp;
560 ByteStorePtr bs;
561 CharPtr str;
562
563 if (sep == NULL) return NULL;
564
565 bs = BSNew (1000);
566 if (bs == NULL) return NULL;
567 aibp = AsnIoBSOpen ("w", bs);
568 if (aibp == NULL) return NULL;
569
570 SeqEntryAsnWrite (sep, aibp->aip, NULL);
571
572 AsnIoFlush (aibp->aip);
573 AsnIoBSClose (aibp);
574
575 str = BSMerge (bs, NULL);
576 BSFree (bs);
577
578 return str;
579 }
580 */
581
582 typedef struct chgdata {
583 Boolean isRefSeq;
584 Boolean sgml;
585 Boolean cdscodon;
586 Boolean rubisco;
587 Boolean rbc;
588 Boolean its;
589 Boolean rnaother;
590 Boolean trnanote;
591 Boolean oldbiomol;
592 Boolean oldgbqual;
593 Boolean badDbxref;
594 Boolean refDbxref;
595 Boolean srcDbxref;
596 Boolean capDbxref;
597 Boolean oldDbxref;
598 Boolean privDbxref;
599 Boolean multDbxref;
600 Boolean rareDbxref;
601 Boolean badOrg;
602 Boolean rpt_unit_seq;
603 Boolean hasUnpublished;
604 Boolean hasPublished;
605 Int4 protdesc;
606 Int4 sfpnote;
607 Int4 gbsource;
608 Int4 cdsconf;
609 } ChangeData, PNTR ChangeDataPtr;
610
611 static Boolean IsRubisco (
612 CharPtr name
613 )
614
615 {
616 return (StringICmp (name, "rubisco large subunit") == 0 ||
617 StringICmp (name, "rubisco small subunit") == 0);
618 }
619
620 static Boolean IsRbc (
621 CharPtr name
622 )
623
624 {
625 return (StringICmp (name, "RbcL") == 0 ||
626 StringICmp (name, "RbcS") == 0);
627 }
628
629 static Boolean IsITS (
630 CharPtr name
631 )
632
633 {
634 return (StringICmp (name, "its1") == 0 ||
635 StringICmp (name, "its 1") == 0 ||
636 StringICmp (name, "its2") == 0 ||
637 StringICmp (name, "its 2") == 0 ||
638 StringICmp (name, "its3") == 0 ||
639 StringICmp (name, "its 3") == 0 ||
640 StringICmp (name, "Ribosomal DNA internal transcribed spacer 1") == 0 ||
641 StringICmp (name, "Ribosomal DNA internal transcribed spacer 2") == 0 ||
642 StringICmp (name, "Ribosomal DNA internal transcribed spacer 3") == 0 ||
643 StringICmp (name, "internal transcribed spacer 1 (ITS1)") == 0 ||
644 StringICmp (name, "internal transcribed spacer 2 (ITS2)") == 0 ||
645 StringICmp (name, "internal transcribed spacer 3 (ITS3)") == 0);
646 }
647
648 static Boolean HasSgml (
649 CharPtr str
650 )
651
652 {
653 Int2 ascii_len;
654 Char buf [1024];
655
656 if (StringHasNoText (str)) return FALSE;
657
658 ascii_len = Sgml2AsciiLen (str);
659 if (ascii_len + 2 > sizeof (buf)) return FALSE;
660
661 Sgml2Ascii (str, buf, ascii_len + 1);
662 if (StringCmp (str, buf) != 0) {
663 return TRUE;
664 }
665
666 return FALSE;
667 }
668
669 static void LookForBadDbxref (
670 ValNodePtr list,
671 ChangeDataPtr cdp,
672 Boolean isSource
673 )
674
675 {
676 Boolean cap;
677 DbtagPtr dp;
678 CharPtr good;
679 ObjectIdPtr oip;
680 Boolean ref;
681 Boolean src;
682 CharPtr str;
683 ValNodePtr vnp;
684
685 if (list == NULL || cdp == NULL) return;
686
687 for (vnp = list; vnp != NULL; vnp = vnp->next) {
688 dp = (DbtagPtr) vnp->data.ptrvalue;
689 if (dp != NULL && StringDoesHaveText (dp->db)) {
690
691 oip = dp->tag;
692 if (oip != NULL && StringDoesHaveText (oip->str)) {
693 if (StringChr (oip->str, ':') != NULL) {
694 cdp->multDbxref = TRUE;
695 }
696 }
697
698 str = dp->db;
699 if (StringICmp (str, "PID") == 0 ||
700 StringICmp (str, "PIDg") == 0 ||
701 StringICmp (str, "PIDd") == 0 ||
702 StringICmp (str, "PIDe") == 0 ||
703 StringICmp (str, "NID") == 0 ||
704 StringICmp (str, "GI") == 0) {
705 cdp->privDbxref = TRUE;
706 continue;
707 }
708 if (StringICmp (str, "SWISS-PROT") == 0 ||
709 StringICmp (str, "SWISSPROT") == 0 ||
710 StringICmp (str, "SPTREMBL") == 0 ||
711 StringICmp (str, "SUBTILIS") == 0 ||
712 StringICmp (str, "MGD") == 0 ||
713 StringCmp (str, "cdd") == 0 ||
714 StringICmp (str, "TrEMBL") == 0 ||
715 StringICmp (str, "LocusID") == 0 ||
716 StringICmp (str, "MaizeDB") == 0 ||
717 StringICmp (str, "UniProt/Swiss-Prot") == 0 ||
718 StringICmp (str, "UniProt/TrEMBL") == 0 ||
719 StringICmp (str, "Genew") == 0 ||
720 StringICmp (str, "GENEDB") == 0 ||
721 StringICmp (str, "GreengenesID") == 0 ||
722 StringICmp (str, "HMPID") == 0 ||
723 StringICmp (str, "IFO") == 0 ||
724 StringICmp (str, "BHB") == 0 ||
725 StringICmp (str, "BioHealthBase") == 0) {
726 cdp->oldDbxref = TRUE;
727 continue;
728 }
729 if (StringICmp (str, "ATCC(dna)") == 0 ||
730 StringICmp (str, "ATCC(in host)") == 0 ||
731 StringICmp (str, "BDGP_EST") == 0 ||
732 StringICmp (str, "BDGP_INS") == 0 ||
733 StringICmp (str, "CGNC") == 0 ||
734 StringICmp (str, "CloneID") == 0 ||
735 StringICmp (str, "ENSEMBL") == 0 ||
736 StringICmp (str, "ESTLIB") == 0 ||
737 StringICmp (str, "GDB") == 0 ||
738 /*
739 StringICmp (str, "GOA") == 0 ||
740 */
741 StringICmp (str, "IMGT/HLA") == 0 ||
742 StringICmp (str, "PIR") == 0 ||
743 StringICmp (str, "PSEUDO") == 0 ||
744 StringICmp (str, "RZPD") == 0 ||
745 StringICmp (str, "SoyBase") == 0 ||
746 StringICmp (str, "UNILIB") == 0) {
747 cdp->rareDbxref = TRUE;
748 continue;
749 }
750 if (StringICmp (str, "MGD") == 0 || StringICmp (str, "MGI") == 0) {
751 oip = dp->tag;
752 if (oip != NULL && StringDoesHaveText (oip->str)) {
753 str = oip->str;
754 if (StringNICmp (str, "MGI:", 4) == 0 || StringNICmp (str, "MGD:", 4) == 0) {
755 cdp->oldDbxref = TRUE;
756 continue;
757 }
758 }
759 } else if (StringICmp (str, "HPRD") == 0) {
760 oip = dp->tag;
761 if (oip != NULL && StringDoesHaveText (oip->str)) {
762 str = oip->str;
763 if (StringNICmp (str, "HPRD_", 5) == 0) {
764 cdp->oldDbxref = TRUE;
765 continue;
766 }
767 }
768 }
769
770 if (isSource && StringCmp (str, "taxon") == 0) continue;
771
772 if (DbxrefIsValid (str, &ref, &src, &cap, &good)) {
773 if (ref && (! cdp->isRefSeq)) {
774 cdp->refDbxref = TRUE;
775 }
776 if (isSource && (! src)) {
777 cdp->srcDbxref = TRUE;
778 }
779 if (cap) {
780 cdp->capDbxref = TRUE;
781 }
782 } else {
783 cdp->badDbxref = TRUE;
784 }
785 }
786 }
787 }
788
789 static void ScoreFeature (
790 SeqFeatPtr sfp,
791 Pointer userdata
792 )
793
794 {
795 BioSourcePtr biop;
796 ChangeDataPtr cdp;
797 Char ch;
798 CharPtr comment;
799 CdRegionPtr crp;
800 CharPtr desc;
801 GBQualPtr gbq;
802 GeneRefPtr grp;
803 CharPtr name;
804 OrgRefPtr orp;
805 ProtRefPtr prp;
806 CharPtr ptr;
807 Uint1 residue;
808 RnaRefPtr rrp;
809 CharPtr str;
810 ValNodePtr vnp;
811
812 if (sfp == NULL) return;
813 cdp = (ChangeDataPtr) userdata;
814 if (cdp == NULL) return;
815
816 comment = sfp->comment;
817 if (StringDoesHaveText (comment)) {
818 (cdp->sfpnote)++;
819 }
820
821 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
822 if (StringCmp (gbq->qual, "partial") == 0 ||
823 StringCmp (gbq->qual, "evidence") == 0 ||
824 StringCmp (gbq->qual, "exception") == 0 ||
825 StringCmp (gbq->qual, "note") == 0 ||
826 StringCmp (gbq->qual, "notes") == 0 ||
827 StringCmp (gbq->qual, "comment") == 0 ||
828 StringCmp (gbq->qual, "db_xref") == 0 ||
829 StringCmp (gbq->qual, "gdb_xref") == 0 ||
830 StringCmp (gbq->qual, "rpt_unit") == 0 ||
831 StringCmp (gbq->qual, "pseudo") == 0 ||
832 StringCmp (gbq->qual, "gene") == 0 ||
833 StringCmp (gbq->qual, "codon_start") == 0 ||
834 StringCmp (gbq->qual, "transposon") == 0 ||
835 StringCmp (gbq->qual, "insertion_seq") == 0) {
836 cdp->oldgbqual = TRUE;
837 } else if (StringICmp (gbq->qual, "rpt_unit_seq") == 0) {
838 if (StringHasNoText (gbq->val)) continue;
839 ptr = gbq->val;
840 ch = *ptr;
841 while (ch != '\0') {
842 if (IS_UPPER (ch)) {
843 cdp->rpt_unit_seq = TRUE;
844 }
845 ptr++;
846 ch = *ptr;
847 }
848 }
849 }
850
851 LookForBadDbxref (sfp->dbxref, cdp, FALSE);
852
853 /* skip feature types that do not use data.value.ptrvalue */
854 switch (sfp->data.choice) {
855 case SEQFEAT_COMMENT:
856 case SEQFEAT_BOND:
857 case SEQFEAT_SITE:
858 case SEQFEAT_PSEC_STR:
859 return;
860 default:
861 break;
862 }
863
864 if (sfp->data.value.ptrvalue == NULL) return;
865
866 switch (sfp->data.choice) {
867 case SEQFEAT_GENE:
868 grp = (GeneRefPtr) sfp->data.value.ptrvalue;
869 if (HasSgml (grp->locus)) {
870 cdp->sgml = TRUE;
871 }
872 if (HasSgml (grp->desc)) {
873 cdp->sgml = TRUE;
874 }
875 for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
876 str = (CharPtr) vnp->data.ptrvalue;
877 if (StringHasNoText (str)) continue;
878 if (HasSgml (str)) {
879 cdp->sgml = TRUE;
880 }
881 }
882 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
883 if (StringCmp (gbq->qual, "map") == 0 ||
884 StringCmp (gbq->qual, "allele") == 0 ||
885 StringCmp (gbq->qual, "locus_tag") == 0 ||
886 StringCmp (gbq->qual, "old_locus_tag") == 0) {
887 cdp->oldgbqual = TRUE;
888 }
889 }
890 LookForBadDbxref (grp->db, cdp, FALSE);
891 break;
892 case SEQFEAT_CDREGION:
893 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
894 if (crp->conflict) {
895 (cdp->cdsconf)++;
896 }
897 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
898 if (StringCmp (gbq->qual, "codon") != 0) continue;
899 if (StringHasNoText (gbq->val)) continue;
900 cdp->cdscodon = TRUE;
901 }
902 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
903 if (StringCmp (gbq->qual, "product") == 0 ||
904 StringCmp (gbq->qual, "function") == 0 ||
905 StringCmp (gbq->qual, "EC_number") == 0 ||
906 StringCmp (gbq->qual, "prot_note") == 0) {
907 cdp->oldgbqual = TRUE;
908 }
909 }
910 break;
911 case SEQFEAT_PROT:
912 prp = (ProtRefPtr) sfp->data.value.ptrvalue;
913 desc = prp->desc;
914 if (StringDoesHaveText (desc)) {
915 (cdp->protdesc)++;
916 }
917 for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
918 str = (CharPtr) vnp->data.ptrvalue;
919 if (StringHasNoText (str)) continue;
920 if (IsRubisco (str)) {
921 cdp->rubisco = TRUE;
922 }
923 if (IsRbc (str)) {
924 cdp->rbc = TRUE;
925 }
926 }
927 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
928 if (StringCmp (gbq->qual, "product") == 0 ||
929 StringCmp (gbq->qual, "function") == 0 ||
930 StringCmp (gbq->qual, "EC_number") == 0 ||
931 StringCmp (gbq->qual, "label") == 0 ||
932 StringCmp (gbq->qual, "allele") == 0) {
933 cdp->oldgbqual = TRUE;
934 }
935 if (StringCmp (gbq->qual, "standard_name") == 0 && prp->name == NULL) {
936 cdp->oldgbqual = TRUE;
937 }
938 }
939 LookForBadDbxref (prp->db, cdp, FALSE);
940 break;
941 case SEQFEAT_RNA :
942 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
943 if (rrp->type == 255 && rrp->ext.choice == 1) {
944 name = (CharPtr) rrp->ext.value.ptrvalue;
945 if (StringCmp (name, "misc_RNA") == 0) {
946 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
947 if (StringCmp (gbq->qual, "product") != 0) continue;
948 name = gbq->val;
949 if (StringHasNoText (name)) continue;
950 if (IsITS (name)) {
951 cdp->its = TRUE;
952 }
953 }
954 } else if (StringCmp (name, "ncRNA") == 0 || StringCmp (name, "tmRNA") == 0) {
955 } else {
956 cdp->rnaother = TRUE;
957 if (IsITS (name)) {
958 cdp->its = TRUE;
959 }
960 }
961 } else if (rrp->type == 3 && rrp->ext.choice == 2) {
962 if (StringDoesHaveText (comment)) {
963 if (StringNCmp (comment, "aa: ", 4) == 0) {
964 comment += 4;
965 }
966 residue = FindTrnaAA3 (comment);
967 if (residue > 0 && residue != 255) {
968 cdp->trnanote = TRUE;
969 }
970 residue = FindTrnaAA (comment);
971 if (residue > 0 && residue != 255) {
972 cdp->trnanote = TRUE;
973 }
974 }
975 }
976 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
977 if (StringCmp (gbq->qual, "product") == 0 ||
978 StringCmp (gbq->qual, "ncRNA_class") == 0 ||
979 StringCmp (gbq->qual, "tag_peptide") == 0) {
980 cdp->oldgbqual = TRUE;
981 }
982 }
983 break;
984 case SEQFEAT_ORG :
985 orp = (OrgRefPtr) sfp->data.value.ptrvalue;
986 LookForBadDbxref (orp->db, cdp, TRUE);
987 cdp->badOrg = TRUE;
988 break;
989 case SEQFEAT_BIOSRC :
990 biop = (BioSourcePtr) sfp->data.value.ptrvalue;
991 orp = biop->org;
992 if (orp != NULL) {
993 LookForBadDbxref (orp->db, cdp, TRUE);
994 }
995 default:
996 break;
997 }
998 }
999
1000 static void ScoreDescriptor (
1001 SeqDescrPtr sdp,
1002 Pointer userdata
1003 )
1004
1005 {
1006 BioSourcePtr biop;
1007 ChangeDataPtr cdp;
1008 GBBlockPtr gbp;
1009 MolInfoPtr mip;
1010 OrgRefPtr orp;
1011
1012 if (sdp == NULL) return;
1013 cdp = (ChangeDataPtr) userdata;
1014 if (cdp == NULL) return;
1015
1016 switch (sdp->choice) {
1017 case Seq_descr_genbank :
1018 gbp = (GBBlockPtr) sdp->data.ptrvalue;
1019 if (gbp != NULL) {
1020 if (StringDoesHaveText (gbp->source)) {
1021 (cdp->gbsource)++;
1022 }
1023 }
1024 break;
1025 case Seq_descr_molinfo :
1026 mip = (MolInfoPtr) sdp->data.ptrvalue;
1027 if (mip != NULL) {
1028 switch (mip->biomol) {
1029 case MOLECULE_TYPE_SNRNA:
1030 case MOLECULE_TYPE_SCRNA:
1031 case MOLECULE_TYPE_SNORNA:
1032 cdp->oldbiomol = TRUE;
1033 break;
1034 default :
1035 break;
1036 }
1037 }
1038 break;
1039 case Seq_descr_org :
1040 orp = (OrgRefPtr) sdp->data.ptrvalue;
1041 if (orp != NULL) {
1042 LookForBadDbxref (orp->db, cdp, TRUE);
1043 }
1044 cdp->badOrg = TRUE;
1045 break;
1046 case Seq_descr_source :
1047 biop = (BioSourcePtr) sdp->data.ptrvalue;
1048 if (biop != NULL) {
1049 orp = biop->org;
1050 if (orp != NULL) {
1051 LookForBadDbxref (orp->db, cdp, TRUE);
1052 }
1053 }
1054 break;
1055 default :
1056 break;
1057 }
1058 }
1059
1060 static void CheckForUnpubPub (
1061 PubdescPtr pdp,
1062 Pointer userdata
1063 )
1064
1065 {
1066 ChangeDataPtr cdp;
1067 CitGenPtr cgp;
1068 ValNodePtr vnp;
1069
1070 if (pdp == NULL) return;
1071 cdp = (ChangeDataPtr) userdata;
1072 if (cdp == NULL) return;
1073
1074 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
1075 if (vnp->choice == PUB_Gen) {
1076 cgp = (CitGenPtr) vnp->data.ptrvalue;
1077 if (cgp != NULL) {
1078 if (StringICmp (cgp->cit, "Unpublished") == 0) {
1079 if (StringICmp (cgp->title, "Direct Submission") != 0) {
1080 cdp->hasUnpublished = TRUE;
1081 }
1082 }
1083 }
1084 } else if (vnp->choice == PUB_Muid || vnp->choice == PUB_PMid) {
1085 cdp->hasPublished = TRUE;
1086 } else if (vnp->choice == PUB_Article || vnp->choice == PUB_Book || vnp->choice == PUB_Man) {
1087 cdp->hasPublished = TRUE;
1088 }
1089 }
1090 }
1091
1092 static void CheckForChanges (
1093 SeqEntryPtr sep,
1094 ChangeDataPtr cdp
1095 )
1096
1097 {
1098 if (sep == NULL || cdp == NULL) return;
1099
1100 VisitFeaturesInSep (sep, (Pointer) cdp, ScoreFeature);
1101 VisitDescriptorsInSep (sep, (Pointer) cdp, ScoreDescriptor);
1102 VisitPubdescsInSep (sep, (Pointer) cdp, CheckForUnpubPub);
1103 }
1104
1105 static void StripBadProtTitles (
1106 BioseqPtr bsp,
1107 Pointer userdata
1108 )
1109
1110 {
1111 CharPtr buf;
1112 size_t buflen = 1001;
1113 ObjValNodePtr ovp;
1114 SeqIdPtr sip;
1115 CharPtr title;
1116 ValNodePtr vnp;
1117
1118 if (bsp == NULL) return;
1119 if (! ISA_aa (bsp->mol)) return;
1120 for (sip = bsp->id; sip != NULL; sip = sip->next) {
1121 if (sip->choice == SEQID_OTHER) return;
1122 }
1123
1124 vnp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
1125 if (vnp == NULL) return;
1126 title = (CharPtr) vnp->data.ptrvalue;
1127 if (StringHasNoText (title)) return;
1128
1129 buf = MemNew (sizeof (Char) * (buflen + 1));
1130 if (buf == NULL) return;
1131
1132 if (NewCreateDefLineBuf (NULL, bsp, buf, buflen, TRUE, FALSE)) {
1133 if (StringICmp (buf, title) != 0) {
1134 if (vnp->extended != 0) {
1135 ovp = (ObjValNodePtr) vnp;
1136 ovp->idx.deleteme = TRUE;
1137 }
1138 }
1139 }
1140
1141 MemFree (buf);
1142 }
1143
1144 static void BadProtTitleProc (
1145 SeqEntryPtr sep,
1146 Pointer mydata,
1147 Int4 index,
1148 Int2 indent
1149 )
1150
1151 {
1152 BioseqSetPtr bssp;
1153
1154 if (sep == NULL) return;
1155 if (! IS_Bioseq_set (sep)) return;
1156 bssp = (BioseqSetPtr) sep->data.ptrvalue;
1157 if (bssp->_class != BioseqseqSet_class_nuc_prot) return;
1158 VisitBioseqsInSep (sep, NULL, StripBadProtTitles);
1159 }
1160
1161 static void BSSaveToFile (
1162 ByteStorePtr bs,
1163 CharPtr path
1164 )
1165
1166 {
1167 Byte buf [256];
1168 Int4 count;
1169 FILE *fp;
1170
1171 if (bs == NULL || StringHasNoText (path)) return;
1172
1173 fp = FileOpen (path, "w");
1174 if (fp != NULL) {
1175 Nlm_BSSeek (bs, 0, SEEK_SET);
1176 count = BSRead (bs, buf, sizeof (buf));
1177 while (count > 0) {
1178 FileWrite (buf, count, 1, fp);
1179 count = BSRead (bs, buf, sizeof (buf));
1180 }
1181 FileClose (fp);
1182 }
1183 }
1184
1185 typedef struct diffblock {
1186 ValNodePtr head;
1187 ValNodePtr tail;
1188 } DiffBlock, PNTR DiffBlockPtr;
1189
1190 static void RecordDiffBlock (
1191 DiffBlockPtr dbp,
1192 CharPtr str
1193 )
1194
1195 {
1196 ValNodePtr vnp;
1197
1198 if (dbp == NULL || StringHasNoText (str)) return;
1199
1200 vnp = ValNodeCopyStr (&(dbp->tail), 0, str);
1201 if (dbp->head == NULL) {
1202 dbp->head = vnp;
1203 }
1204 dbp->tail = vnp;
1205 }
1206
1207 static void WriteDiffBlock (
1208 DiffBlockPtr dbp,
1209 FILE *fp
1210 )
1211
1212 {
1213 Char ch;
1214 Int2 idx;
1215 Int2 margin = INT2_MAX;
1216 CharPtr ptr;
1217 Int2 spaces;
1218 CharPtr str;
1219 ValNodePtr vnp;
1220
1221 if (dbp == NULL || dbp->head == NULL || fp == NULL) return;
1222
1223 for (vnp = dbp->head; vnp != NULL; vnp = vnp->next) {
1224 str = (CharPtr) vnp->data.ptrvalue;
1225 if (StringHasNoText (str)) continue;
1226 ch = str [0];
1227 if (ch == '<' || ch == '>') {
1228 ptr = str + 1;
1229 ch = *ptr;
1230 spaces = 0;
1231 while (ch == ' ') {
1232 spaces++;
1233 ptr++;
1234 ch = *ptr;
1235 }
1236 if (spaces < margin) {
1237 margin = spaces;
1238 }
1239 }
1240 }
1241
1242 if (margin > 80) {
1243 margin = 80;
1244 }
1245
1246 for (vnp = dbp->head; vnp != NULL; vnp = vnp->next) {
1247 str = (CharPtr) vnp->data.ptrvalue;
1248 if (StringHasNoText (str)) continue;
1249 ch = str [0];
1250 if (ch == '<' || ch == '>') {
1251 ptr = str + 1;
1252 ch = *ptr;
1253 idx = 0;
1254 while (idx < margin && ch == ' ') {
1255 idx++;
1256 ptr++;
1257 ch = *ptr;
1258 }
1259 fprintf (fp, "%c %s\n", str [0], ptr);
1260 } else if (ch == '-') {
1261 fprintf (fp, "---\n");
1262 } else if (ch == '=') {
1263 fprintf (fp, "===\n");
1264 }
1265 }
1266
1267 fprintf (fp, "\n");
1268 fflush (fp);
1269 }
1270
1271 static void ResetDiffBlock (
1272 DiffBlockPtr dbp
1273 )
1274
1275 {
1276 if (dbp == NULL) return;
1277
1278 dbp->head = ValNodeFreeData (dbp->head);
1279 dbp->tail = NULL;
1280 }
1281
1282 static void ReportAsnDiffs (
1283 FILE *logfp,
1284 CharPtr id,
1285 ByteStorePtr bs1,
1286 ByteStorePtr bs2)
1287
1288 {
1289 #ifdef OS_UNIX
1290 Char ch;
1291 Char cmmd [512];
1292 DiffBlock db;
1293 int diff;
1294 FileCache fc;
1295 FILE *fp;
1296 Char line [512];
1297 Char path1 [PATH_MAX];
1298 Char path2 [PATH_MAX];
1299 Char path3 [PATH_MAX];
1300 CharPtr str;
1301
1302 if (logfp == NULL || StringHasNoText (id)) return;
1303 if (bs1 == NULL || bs2 == NULL) return;
1304
1305 TmpNam (path1);
1306 TmpNam (path2);
1307 TmpNam (path3);
1308
1309 BSSaveToFile (bs1, path1);
1310 BSSaveToFile (bs2, path2);
1311
1312 db.head = NULL;
1313 db.tail = NULL;
1314
1315 sprintf (cmmd, "diff -b -h %s %s > %s", path1, path2, path3);
1316 diff = system (cmmd);
1317
1318 if (diff > 0) {
1319 fp = FileOpen (path3, "r");
1320 if (fp != NULL) {
1321 fprintf (logfp, "\n\n%s\n\n", id);
1322 fflush (logfp);
1323 if (FileCacheSetup (&fc, fp)) {
1324 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1325 while (str != NULL) {
1326 ch = line [0];
1327 if (ch == '<' || ch == '>') {
1328 RecordDiffBlock (&db, line);
1329 } else if (ch == '-') {
1330 RecordDiffBlock (&db, "---");
1331 } else if (IS_DIGIT (ch)) {
1332 WriteDiffBlock (&db, logfp);
1333 ResetDiffBlock (&db);
1334 RecordDiffBlock (&db, "===");
1335 } else if (StringHasNoText (str)) {
1336 WriteDiffBlock (&db, logfp);
1337 ResetDiffBlock (&db);
1338 }
1339 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1340 }
1341 WriteDiffBlock (&db, logfp);
1342 ResetDiffBlock (&db);
1343 }
1344 fprintf (logfp, "//\n\n");
1345 FileClose (fp);
1346 }
1347 }
1348
1349 sprintf (cmmd, "rm %s; rm %s; rm %s", path1, path2, path3);
1350 system (cmmd);
1351 #endif
1352 }
1353
1354 static void DoAsnDiffReport (
1355 SeqEntryPtr sep,
1356 CleanFlagPtr cfp
1357 )
1358
1359 {
1360 ByteStorePtr bs = NULL, tmp = NULL;
1361 Boolean okay = FALSE;
1362
1363 if (sep == NULL || cfp == NULL) return;
1364
1365 RemoveAllNcbiCleanupUserObjects (sep);
1366
1367 /* Capital letters avoid unwanted diffs on issues already fixed in ID */
1368
1369 if (StringChr (cfp->selective, 'A') != NULL) {
1370 VisitPubdescsInSep (sep, NULL, CleanupPubAuthors);
1371 }
1372 if (StringChr (cfp->selective, 'P') != NULL) {
1373 VisitPubdescsInSep (sep, (Pointer) cfp, CleanupPubBody);
1374 }
1375 if (StringChr (cfp->selective, 'L') != NULL) {
1376 VisitFeaturesInSep (sep, NULL, CleanupLocation);
1377 }
1378 if (StringChr (cfp->selective, 'R') != NULL) {
1379 VisitFeaturesInSep (sep, NULL, CleanupMostRNAs);
1380 VisitFeaturesInSep (sep, NULL, CleanupRemainingRNAs);
1381 VisitFeaturesInSep (sep, NULL, ModRNAs);
1382 }
1383 if (StringChr (cfp->selective, 'S') != NULL) {
1384 SortSeqEntryQualifiers (sep);
1385 }
1386 if (StringChr (cfp->selective, 'G') != NULL) {
1387 EntryChangeGBSource (sep);
1388 EntryCheckGBBlock (sep);
1389 }
1390 if (StringChr (cfp->selective, 'K') != NULL) {
1391 MoveFeatsFromPartsSet (sep);
1392 move_cds_ex (sep, TRUE);
1393 }
1394 if (StringChr (cfp->selective, 'M') != NULL) {
1395 SeqEntryPubsAsn4 (sep);
1396 }
1397
1398 NormalizeDescriptorOrder (sep);
1399
1400 /* Look for change in single issue */
1401
1402 bs = Se2BsX (sep);
1403 if (StringHasNoText (cfp->selective)) {
1404 okay = TRUE;
1405 } else if (StringChr (cfp->selective, 'a') != NULL) {
1406 VisitPubdescsInSep (sep, NULL, CleanupPubAuthors);
1407 tmp = Se2BsX (sep);
1408 if (! BSEqual (bs, tmp)) {
1409 okay = TRUE;
1410 }
1411 tmp = BSFree (tmp);
1412 } else if (StringChr (cfp->selective, 'p') != NULL) {
1413 VisitPubdescsInSep (sep, (Pointer) cfp, CleanupPubBody);
1414 tmp = Se2BsX (sep);
1415 if (! BSEqual (bs, tmp)) {
1416 okay = TRUE;
1417 }
1418 tmp = BSFree (tmp);
1419 } else if (StringChr (cfp->selective, 'l') != NULL) {
1420 VisitFeaturesInSep (sep, NULL, CleanupLocation);
1421 tmp = Se2BsX (sep);
1422 if (! BSEqual (bs, tmp)) {
1423 okay = TRUE;
1424 }
1425 tmp = BSFree (tmp);
1426 } else if (StringChr (cfp->selective, 'r') != NULL) {
1427 VisitFeaturesInSep (sep, NULL, CleanupMostRNAs);
1428 VisitFeaturesInSep (sep, NULL, CleanupRemainingRNAs);
1429 VisitFeaturesInSep (sep, NULL, ModRNAs);
1430 tmp = Se2BsX (sep);
1431 if (! BSEqual (bs, tmp)) {
1432 okay = TRUE;
1433 }
1434 tmp = BSFree (tmp);
1435 } else if (StringChr (cfp->selective, 's') != NULL) {
1436 SortSeqEntryQualifiers (sep);
1437 tmp = Se2BsX (sep);
1438 if (! BSEqual (bs, tmp)) {
1439 okay = TRUE;
1440 }
1441 tmp = BSFree (tmp);
1442 } else if (StringChr (cfp->selective, 'g') != NULL) {
1443 EntryChangeGBSource (sep);
1444 EntryCheckGBBlock (sep);
1445 tmp = Se2BsX (sep);
1446 if (! BSEqual (bs, tmp)) {
1447 okay = TRUE;
1448 }
1449 tmp = BSFree (tmp);
1450 } else if (StringChr (cfp->selective, 'k') != NULL) {
1451 MoveFeatsFromPartsSet (sep);
1452 move_cds_ex (sep, TRUE);
1453 tmp = Se2BsX (sep);
1454 if (! BSEqual (bs, tmp)) {
1455 okay = TRUE;
1456 }
1457 tmp = BSFree (tmp);
1458 } else if (StringChr (cfp->selective, 'm') != NULL) {
1459 SeqEntryPubsAsn4 (sep);
1460 NormalizeDescriptorOrder (sep);
1461 tmp = Se2BsX (sep);
1462 if (! BSEqual (bs, tmp)) {
1463 okay = TRUE;
1464 }
1465 tmp = BSFree (tmp);
1466 } else {
1467 okay = TRUE;
1468 }
1469
1470 /* Report incremental diff */
1471
1472 if (okay) {
1473 SeriousSeqEntryCleanup (sep, NULL, NULL);
1474 RemoveAllNcbiCleanupUserObjects (sep);
1475 NormalizeDescriptorOrder (sep);
1476 tmp = Se2BsX (sep);
1477 if (! BSEqual (bs, tmp)) {
1478 if (cfp->logfp != NULL) {
1479 ReportAsnDiffs (cfp->logfp, cfp->buf, bs, tmp);
1480 }
1481 }
1482 tmp = BSFree (tmp);
1483 }
1484
1485 BSFree (bs);
1486 }
1487
1488 static void DoASNReport (
1489 SeqEntryPtr sep,
1490 CleanFlagPtr cfp,
1491 Boolean dossec,
1492 Boolean quick
1493 )
1494
1495 {
1496 Boolean auth = FALSE, bsec = FALSE, clnr = FALSE, gbbk = FALSE,
1497 modr = FALSE, move = FALSE, norm = FALSE, othr = FALSE,
1498 pack = FALSE, publ = FALSE, ssec = FALSE, sloc = FALSE,
1499 sort = FALSE, titl = FALSE, ncbiusrobj = FALSE;
1500 ByteStorePtr bs = NULL, tmp = NULL;
1501 ChangeData cdbefore, cdafter;
1502 Uint2 entityID;
1503
1504 if (sep == NULL || cfp == NULL) return;
1505
1506 if (FindNcbiCleanupUserObject (sep) != NULL) {
1507 ncbiusrobj = TRUE;
1508 }
1509
1510 RemoveAllNcbiCleanupUserObjects (sep);
1511
1512 if (quick) {
1513 bs = Se2Bs (sep);
1514
1515 NormalizeDescriptorOrder (sep);
1516 tmp = Se2Bs (sep);
1517 if (! BSEqual (bs, tmp)) {
1518 norm = TRUE;
1519 }
1520 BSFree (bs);
1521 bs = tmp;
1522
1523 SeriousSeqEntryCleanup (sep, NULL, NULL);
1524 RemoveAllNcbiCleanupUserObjects (sep);
1525 NormalizeDescriptorOrder (sep);
1526 tmp = Se2Bs (sep);
1527 if (! BSEqual (bs, tmp)) {
1528 ssec = TRUE;
1529 }
1530 BSFree (bs);
1531 bs = tmp;
1532
1533 BSFree (bs);
1534
1535 if (ssec) {
1536 (cfp->rawcounts.ssec)++;
1537 (cfp->cumcounts.ssec)++;
1538 if (cfp->logfp != NULL) {
1539 fprintf (cfp->logfp, "SSEC %s\n", cfp->buf);
1540 fflush (cfp->logfp);
1541 }
1542 } else if (norm) {
1543 (cfp->rawcounts.norm)++;
1544 (cfp->cumcounts.norm)++;
1545 if (cfp->logfp != NULL) {
1546 fprintf (cfp->logfp, "NORM %s\n", cfp->buf);
1547 fflush (cfp->logfp);
1548 }
1549 } else {
1550 (cfp->rawcounts.okay)++;
1551 (cfp->cumcounts.okay)++;
1552 if (cfp->logfp != NULL) {
1553 fprintf (cfp->logfp, "OKAY %s\n", cfp->buf);
1554 fflush (cfp->logfp);
1555 }
1556 }
1557
1558 return;
1559 }
1560
1561 MemSet ((Pointer) &cdbefore, 0, sizeof (ChangeData));
1562 MemSet ((Pointer) &cdafter, 0, sizeof (ChangeData));
1563
1564 cdbefore.isRefSeq = cfp->isRefSeq;
1565 cdafter.isRefSeq = cfp->isRefSeq;
1566
1567 CheckForChanges (sep, &cdbefore);
1568
1569 bs = Se2Bs (sep);
1570
1571 NormalizeDescriptorOrder (sep);
1572 tmp = Se2Bs (sep);
1573 if (! BSEqual (bs, tmp)) {
1574 norm = TRUE;
1575 }
1576 BSFree (bs);
1577 bs = tmp;
1578
1579 VisitFeaturesInSep (sep, NULL, CleanupLocation);
1580 tmp = Se2Bs (sep);
1581 if (! BSEqual (bs, tmp)) {
1582 sloc = TRUE;
1583 }
1584 BSFree (bs);
1585 bs = tmp;
1586
1587 VisitFeaturesInSep (sep, NULL, CleanupMostRNAs);
1588 tmp = Se2Bs (sep);
1589 if (! BSEqual (bs, tmp)) {
1590 clnr = TRUE;
1591 }
1592 BSFree (bs);
1593 bs = tmp;
1594
1595 VisitFeaturesInSep (sep, NULL, CleanupRemainingRNAs);
1596 tmp = Se2Bs (sep);
1597 if (! BSEqual (bs, tmp)) {
1598 othr = TRUE;
1599 }
1600 BSFree (bs);
1601 bs = tmp;
1602
1603 VisitFeaturesInSep (sep, NULL, ModRNAs);
1604 tmp = Se2Bs (sep);
1605 if (! BSEqual (bs, tmp)) {
1606 modr = TRUE;
1607 }
1608 BSFree (bs);
1609 bs = tmp;
1610
1611 VisitPubdescsInSep (sep, NULL, CleanupPubAuthors);
1612 tmp = Se2Bs (sep);
1613 if (! BSEqual (bs, tmp)) {
1614 auth = TRUE;
1615 }
1616 BSFree (bs);
1617 bs = tmp;
1618
1619 VisitPubdescsInSep (sep, (Pointer) cfp, CleanupPubBody);
1620 tmp = Se2Bs (sep);
1621 if (! BSEqual (bs, tmp)) {
1622 publ = TRUE;
1623 }
1624 BSFree (bs);
1625 bs = tmp;
1626
1627 SortSeqEntryQualifiers (sep);
1628 tmp = Se2Bs (sep);
1629 if (! BSEqual (bs, tmp)) {
1630 sort = TRUE;
1631 }
1632 BSFree (bs);
1633 bs = tmp;
1634
1635 BasicSeqEntryCleanup (sep);
1636 tmp = Se2Bs (sep);
1637 if (! BSEqual (bs, tmp)) {
1638 bsec = TRUE;
1639 }
1640 BSFree (bs);
1641 bs = tmp;
1642
1643 EntryChangeGBSource (sep);
1644 EntryCheckGBBlock (sep);
1645 tmp = Se2Bs (sep);
1646 if (! BSEqual (bs, tmp)) {
1647 gbbk = TRUE;
1648 }
1649 BSFree (bs);
1650 bs = tmp;
1651
1652 entityID = ObjMgrGetEntityIDForChoice (sep);
1653 SeqMgrIndexFeatures (entityID, NULL);
1654 SeqEntryExplore (sep, NULL, BadProtTitleProc);
1655 DeleteMarkedObjects (0, OBJ_SEQENTRY, (Pointer) sep);
1656 SeqMgrIndexFeatures (entityID, NULL);
1657 InstantiateProteinTitles (entityID, NULL);
1658 SeqMgrClearFeatureIndexes (entityID, NULL);
1659 BasicSeqEntryCleanup (sep);
1660 NormalizeDescriptorOrder (sep);
1661 tmp = Se2Bs (sep);
1662 if (! BSEqual (bs, tmp)) {
1663 titl = TRUE;
1664 }
1665 BSFree (bs);
1666 bs = tmp;
1667
1668 MoveFeatsFromPartsSet (sep);
1669 move_cds_ex (sep, TRUE);
1670 tmp = Se2Bs (sep);
1671 if (! BSEqual (bs, tmp)) {
1672 pack = TRUE;
1673 }
1674 BSFree (bs);
1675 bs = tmp;
1676
1677 SeqEntryPubsAsn4 (sep);
1678 NormalizeDescriptorOrder (sep);
1679 tmp = Se2Bs (sep);
1680 if (! BSEqual (bs, tmp)) {
1681 move = TRUE;
1682 }
1683 BSFree (bs);
1684 bs = tmp;
1685
1686 if (dossec) {
1687 SeriousSeqEntryCleanup (sep, NULL, NULL);
1688 RemoveAllNcbiCleanupUserObjects (sep);
1689 NormalizeDescriptorOrder (sep);
1690 tmp = Se2Bs (sep);
1691 if (! BSEqual (bs, tmp)) {
1692 ssec = TRUE;
1693 }
1694 BSFree (bs);
1695 bs = tmp;
1696 }
1697
1698 BSFree (bs);
1699
1700 CheckForChanges (sep, &cdafter);
1701
1702 if (ssec) {
1703 (cfp->rawcounts.ssec)++;
1704 (cfp->cumcounts.ssec)++;
1705 if (cfp->logfp != NULL) {
1706 fprintf (cfp->logfp, "SSEC %s\n", cfp->buf);
1707 fflush (cfp->logfp);
1708 }
1709 } else if (move) {
1710 (cfp->rawcounts.move)++;
1711 (cfp->cumcounts.move)++;
1712 if (cfp->logfp != NULL) {
1713 fprintf (cfp->logfp, "MOVE %s\n", cfp->buf);
1714 fflush (cfp->logfp);
1715 }
1716 } else if (pack) {
1717 (cfp->rawcounts.pack)++;
1718 (cfp->cumcounts.pack)++;
1719 if (cfp->logfp != NULL) {
1720 fprintf (cfp->logfp, "PACK %s\n", cfp->buf);
1721 fflush (cfp->logfp);
1722 }
1723 } else if (titl) {
1724 (cfp->rawcounts.titl)++;
1725 (cfp->cumcounts.titl)++;
1726 if (cfp->logfp != NULL) {
1727 fprintf (cfp->logfp, "TITL %s\n", cfp->buf);
1728 fflush (cfp->logfp);
1729 }
1730 } else if (gbbk) {
1731 (cfp->rawcounts.gbbk)++;
1732 (cfp->cumcounts.gbbk)++;
1733 if (cfp->logfp != NULL) {
1734 fprintf (cfp->logfp, "GBBK %s\n", cfp->buf);
1735 fflush (cfp->logfp);
1736 }
1737 } else if (bsec) {
1738 (cfp->rawcounts.bsec)++;
1739 (cfp->cumcounts.bsec)++;
1740 if (cfp->logfp != NULL) {
1741 fprintf (cfp->logfp, "BSEC %s\n", cfp->buf);
1742 fflush (cfp->logfp);
1743 }
1744 } else if (sort) {
1745 (cfp->rawcounts.sort)++;
1746 (cfp->cumcounts.sort)++;
1747 if (cfp->logfp != NULL) {
1748 fprintf (cfp->logfp, "SORT %s\n", cfp->buf);
1749 fflush (cfp->logfp);
1750 }
1751 } else if (sloc) {
1752 (cfp->rawcounts.sloc)++;
1753 (cfp->cumcounts.sloc)++;
1754 if (cfp->logfp != NULL) {
1755 fprintf (cfp->logfp, "SLOC %s\n", cfp->buf);
1756 fflush (cfp->logfp);
1757 }
1758 } else if (clnr) {
1759 (cfp->rawcounts.clnr)++;
1760 (cfp->cumcounts.clnr)++;
1761 if (cfp->logfp != NULL) {
1762 fprintf (cfp->logfp, "CLNR %s\n", cfp->buf);
1763 fflush (cfp->logfp);
1764 }
1765 } else if (othr) {
1766 (cfp->rawcounts.othr)++;
1767 (cfp->cumcounts.othr)++;
1768 if (cfp->logfp != NULL) {
1769 fprintf (cfp->logfp, "OTHR %s\n", cfp->buf);
1770 fflush (cfp->logfp);
1771 }
1772 } else if (modr) {
1773 (cfp->rawcounts.modr)++;
1774 (cfp->cumcounts.modr)++;
1775 if (cfp->logfp != NULL) {
1776 fprintf (cfp->logfp, "MODR %s\n", cfp->buf);
1777 fflush (cfp->logfp);
1778 }
1779 } else if (publ) {
1780 (cfp->rawcounts.publ)++;
1781 (cfp->cumcounts.publ)++;
1782 if (cfp->logfp != NULL) {
1783 fprintf (cfp->logfp, "PUBL %s\n", cfp->buf);
1784 fflush (cfp->logfp);
1785 }
1786 } else if (auth) {
1787 (cfp->rawcounts.auth)++;
1788 (cfp->cumcounts.auth)++;
1789 if (cfp->logfp != NULL) {
1790 fprintf (cfp->logfp, "AUTH %s\n", cfp->buf);
1791 fflush (cfp->logfp);
1792 }
1793 } else if (norm) {
1794 (cfp->rawcounts.norm)++;
1795 (cfp->cumcounts.norm)++;
1796 if (cfp->logfp != NULL) {
1797 fprintf (cfp->logfp, "NORM %s\n", cfp->buf);
1798 fflush (cfp->logfp);
1799 }
1800 } else {
1801 (cfp->rawcounts.okay)++;
1802 (cfp->cumcounts.okay)++;
1803 if (cfp->logfp != NULL) {
1804 fprintf (cfp->logfp, "OKAY %s\n", cfp->buf);
1805 fflush (cfp->logfp);
1806 }
1807 }
1808
1809 if (cdbefore.oldgbqual) {
1810 if (cfp->logfp != NULL) {
1811 fprintf (cfp->logfp, "GBQ %s\n", cfp->buf);
1812 fflush (cfp->logfp);
1813 }
1814 }
1815 if (cdbefore.sgml) {
1816 if (cfp->logfp != NULL) {
1817 fprintf (cfp->logfp, "SGM %s\n", cfp->buf);
1818 fflush (cfp->logfp);
1819 }
1820 }
1821 if (cdbefore.cdscodon) {
1822 if (cfp->logfp != NULL) {
1823 fprintf (cfp->logfp, "CDN %s\n", cfp->buf);
1824 fflush (cfp->logfp);
1825 }
1826 }
1827 if (cdbefore.rubisco) {
1828 if (cfp->logfp != NULL) {
1829 fprintf (cfp->logfp, "RUB %s\n", cfp->buf);
1830 fflush (cfp->logfp);
1831 }
1832 }
1833 if (cdbefore.rbc) {
1834 if (cfp->logfp != NULL) {
1835 fprintf (cfp->logfp, "RBC %s\n", cfp->buf);
1836 fflush (cfp->logfp);
1837 }
1838 }
1839 if (cdbefore.its) {
1840 if (cfp->logfp != NULL) {
1841 fprintf (cfp->logfp, "ITS %s\n", cfp->buf);
1842 fflush (cfp->logfp);
1843 }
1844 }
1845 if (cdbefore.rnaother) {
1846 if (cfp->logfp != NULL) {
1847 fprintf (cfp->logfp, "RNA %s\n", cfp->buf);
1848 fflush (cfp->logfp);
1849 }
1850 }
1851 if (cdbefore.trnanote) {
1852 if (cfp->logfp != NULL) {
1853 fprintf (cfp->logfp, "TRN %s\n", cfp->buf);
1854 fflush (cfp->logfp);
1855 }
1856 }
1857 if (cdbefore.oldbiomol) {
1858 if (cfp->logfp != NULL) {
1859 fprintf (cfp->logfp, "MOL %s\n", cfp->buf);
1860 fflush (cfp->logfp);
1861 }
1862 }
1863 if (cdbefore.badOrg) {
1864 if (cfp->logfp != NULL) {
1865 fprintf (cfp->logfp, "ORG %s\n", cfp->buf);
1866 fflush (cfp->logfp);
1867 }
1868 }
1869 if (cdbefore.rpt_unit_seq) {
1870 if (cfp->logfp != NULL) {
1871 fprintf (cfp->logfp, "RUS %s\n", cfp->buf);
1872 fflush (cfp->logfp);
1873 }
1874 }
1875
1876 if (cdbefore.badDbxref) {
1877 if (cfp->logfp != NULL) {
1878 fprintf (cfp->logfp, "BDX %s\n", cfp->buf);
1879 fflush (cfp->logfp);
1880 }
1881 }
1882 if (cdbefore.refDbxref) {
1883 if (cfp->logfp != NULL) {
1884 fprintf (cfp->logfp, "FDX %s\n", cfp->buf);
1885 fflush (cfp->logfp);
1886 }
1887 }
1888 if (cdbefore.srcDbxref) {
1889 if (cfp->logfp != NULL) {
1890 fprintf (cfp->logfp, "SDX %s\n", cfp->buf);
1891 fflush (cfp->logfp);
1892 }
1893 }
1894 if (cdbefore.capDbxref) {
1895 if (cfp->logfp != NULL) {
1896 fprintf (cfp->logfp, "CDX %s\n", cfp->buf);
1897 fflush (cfp->logfp);
1898 }
1899 }
1900 if (cdbefore.privDbxref) {
1901 if (cfp->logfp != NULL) {
1902 fprintf (cfp->logfp, "PDX %s\n", cfp->buf);
1903 fflush (cfp->logfp);
1904 }
1905 }
1906 if (cdbefore.oldDbxref) {
1907 if (cfp->logfp != NULL) {
1908 fprintf (cfp->logfp, "ODX %s\n", cfp->buf);
1909 fflush (cfp->logfp);
1910 }
1911 }
1912 if (cdbefore.multDbxref) {
1913 if (cfp->logfp != NULL) {
1914 fprintf (cfp->logfp, "MDX %s\n", cfp->buf);
1915 fflush (cfp->logfp);
1916 }
1917 }
1918 if (cdbefore.rareDbxref) {
1919 if (cfp->logfp != NULL) {
1920 fprintf (cfp->logfp, "RDX %s\n", cfp->buf);
1921 fflush (cfp->logfp);
1922 }
1923 }
1924 if (cdafter.hasUnpublished && ! cdafter.hasPublished) {
1925 if (cfp->logfp != NULL) {
1926 fprintf (cfp->logfp, "UNP %s\n", cfp->buf);
1927 fflush (cfp->logfp);
1928 }
1929 }
1930
1931 if (sort) {
1932 if (cfp->logfp != NULL) {
1933 fprintf (cfp->logfp, "SRT %s\n", cfp->buf);
1934 fflush (cfp->logfp);
1935 }
1936 }
1937 if (sloc) {
1938 if (cfp->logfp != NULL) {
1939 fprintf (cfp->logfp, "SLC %s\n", cfp->buf);
1940 fflush (cfp->logfp);
1941 }
1942 }
1943 if (clnr) {
1944 if (cfp->logfp != NULL) {
1945 fprintf (cfp->logfp, "RCN %s\n", cfp->buf);
1946 fflush (cfp->logfp);
1947 }
1948 }
1949 if (othr) {
1950 if (cfp->logfp != NULL) {
1951 fprintf (cfp->logfp, "RNO %s\n", cfp->buf);
1952 fflush (cfp->logfp);
1953 }
1954 }
1955 if (modr) {
1956 if (cfp->logfp != NULL) {
1957 fprintf (cfp->logfp, "RMD %s\n", cfp->buf);
1958 fflush (cfp->logfp);
1959 }
1960 }
1961 if (publ) {
1962 if (cfp->logfp != NULL) {
1963 fprintf (cfp->logfp, "PUB %s\n", cfp->buf);
1964 fflush (cfp->logfp);
1965 }
1966 }
1967 if (auth) {
1968 if (cfp->logfp != NULL) {
1969 fprintf (cfp->logfp, "ATH %s\n", cfp->buf);
1970 fflush (cfp->logfp);
1971 }
1972 }
1973 if (pack) {
1974 if (cfp->logfp != NULL) {
1975 fprintf (cfp->logfp, "PKG %s\n", cfp->buf);
1976 fflush (cfp->logfp);
1977 }
1978 }
1979 if (move) {
1980 if (cfp->logfp != NULL) {
1981 fprintf (cfp->logfp, "MVP %s\n", cfp->buf);
1982 fflush (cfp->logfp);
1983 }
1984 }
1985 if (titl) {
1986 if (cfp->logfp != NULL) {
1987 fprintf (cfp->logfp, "TTL %s\n", cfp->buf);
1988 fflush (cfp->logfp);
1989 }
1990 }
1991
1992 if (cdbefore.protdesc != cdafter.protdesc) {
1993 if (cfp->logfp != NULL) {
1994 fprintf (cfp->logfp, "PRT %s\n", cfp->buf);
1995 fflush (cfp->logfp);
1996 }
1997 }
1998 if (cdbefore.sfpnote != cdafter.sfpnote) {
1999 if (cfp->logfp != NULL) {
2000 fprintf (cfp->logfp, "COM %s\n", cfp->buf);
2001 fflush (cfp->logfp);
2002 }
2003 }
2004 if (cdbefore.gbsource != cdafter.gbsource) {
2005 if (cfp->logfp != NULL) {
2006 fprintf (cfp->logfp, "SRC %s\n", cfp->buf);
2007 fflush (cfp->logfp);
2008 }
2009 }
2010 if (cdbefore.cdsconf != cdafter.cdsconf) {
2011 if (cfp->logfp != NULL) {
2012 fprintf (cfp->logfp, "CNF %s\n", cfp->buf);
2013 fflush (cfp->logfp);
2014 }
2015 }
2016
2017 if (ncbiusrobj) {
2018 if (cfp->logfp != NULL) {
2019 fprintf (cfp->logfp, "USR %s\n", cfp->buf);
2020 fflush (cfp->logfp);
2021 }
2022 }
2023 }
2024
2025 static CharPtr ffmod [] = {
2026 "",
2027 "release",
2028 "entrez",
2029 "gbench",
2030 "dump",
2031 NULL
2032 };
2033
2034 static void DoGBFFReport (
2035 SeqEntryPtr sep,
2036 CleanFlagPtr cfp,
2037 Int2 batch
2038 )
2039
2040 {
2041 #ifdef OS_UNIX
2042 AsnIoPtr aip;
2043 Char arguments [128];
2044 BioseqPtr bsp;
2045 Char ch;
2046 Char cmmd [512];
2047 int diff;
2048 FileCache fc;
2049 FILE *fp;
2050 SeqEntryPtr fsep;
2051 Char line [512];
2052 FILE *ofp;
2053 Char path1 [PATH_MAX];
2054 Char path2 [PATH_MAX];
2055 Char path3 [PATH_MAX];
2056 CharPtr rep = "reports";
2057 SeqIdPtr sip;
2058 CharPtr str;
2059
2060 if (sep == NULL || cfp == NULL) return;
2061
2062 fsep = FindNthBioseq (sep, 1);
2063 if (fsep != NULL && fsep->choice == 1) {
2064 bsp = (BioseqPtr) fsep->data.ptrvalue;
2065 if (bsp != NULL) {
2066 for (sip = bsp->id; sip != NULL; sip = sip->next) {
2067 switch (sip->choice) {
2068 case SEQID_GENBANK :
2069 rep = "gbreports";
2070 break;
2071 case SEQID_EMBL :
2072 rep = "ebreports";
2073 break;
2074 case SEQID_DDBJ :
2075 rep = "djreports";
2076 break;
2077 case SEQID_OTHER :
2078 rep = "rfreports";
2079 break;
2080 default :
2081 break;
2082 }
2083 }
2084 }
2085 }
2086
2087 if (cfp->logfp != NULL) {
2088 fprintf (cfp->logfp, "%s\n", cfp->buf);
2089 fflush (cfp->logfp);
2090 }
2091
2092 if (batch == 1) {
2093
2094 TmpNam (path1);
2095 TmpNam (path2);
2096
2097 fp = FileOpen (path1, "w");
2098 if (fp != NULL) {
2099 SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp);
2100 }
2101 FileClose (fp);
2102 SeriousSeqEntryCleanupBulk (sep);
2103 fp = FileOpen (path2, "w");
2104 if (fp != NULL) {
2105 SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp);
2106 }
2107 FileClose (fp);
2108
2109 sprintf (cmmd, "%s -o %s -n %s -d %s", cfp->ffdiff, path1, path2, rep);
2110 system (cmmd);
2111
2112 sprintf (cmmd, "rm %s; rm %s", path1, path2);
2113 system (cmmd);
2114
2115 } else if (batch == 2) {
2116
2117 TmpNam (path1);
2118 TmpNam (path2);
2119 TmpNam (path3);
2120
2121 aip = AsnIoOpen (path3, "w");
2122 if (aip == NULL) return;
2123
2124 SeqEntryAsnWrite (sep, aip, NULL);
2125 AsnIoClose (aip);
2126
2127 fp = FileOpen (path1, "w");
2128 if (fp != NULL) {
2129 SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp);
2130 }
2131 FileClose (fp);
2132
2133 arguments [0] = '\0';
2134 sprintf (arguments,
2135 "-format genbank -mode %s -style normal -view nuc -nocleanup",
2136 ffmod [(int) cfp->ffmode]);
2137
2138 sprintf (cmmd, "%s %s -i %s -o %s", cfp->asn2flat, arguments, path3, path2);
2139 system (cmmd);
2140
2141 sprintf (cmmd, "diff -h %s %s > %s", path1, path2, path3);
2142 diff = system (cmmd);
2143
2144 if (diff > 0) {
2145 fp = FileOpen (path3, "r");
2146 ofp = FileOpen (rep, "a");
2147 if (fp != NULL && ofp != NULL) {
2148 fprintf (ofp, "\n\n%s\n", cfp->buf);
2149 fflush (ofp);
2150 if (FileCacheSetup (&fc, fp)) {
2151 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2152 while (str != NULL) {
2153 ch = line [0];
2154 if (ch == '<' || ch == '>' || ch == '-') {
2155 fprintf (ofp, "%s\n", line);
2156 } else if (IS_DIGIT (ch)) {
2157 fprintf (ofp, "\n%s\n", "===");
2158 }
2159 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2160 }
2161 }
2162 }
2163 FileClose (ofp);
2164 FileClose (fp);
2165 }
2166
2167 sprintf (cmmd, "rm %s; rm %s; rm %s", path1, path2, path3);
2168 system (cmmd);
2169 }
2170 #endif
2171 }
2172
2173 static void DoValidatorReport (
2174 SeqEntryPtr sep,
2175 FILE *logfp,
2176 CharPtr id,
2177 CharPtr asnval
2178 )
2179
2180 {
2181 #ifdef OS_UNIX
2182 AsnIoPtr aip;
2183 Char ch;
2184 Char cmmd [512];
2185 int diff;
2186 FileCache fc;
2187 FILE *fp;
2188 Char line [512];
2189 Char path1 [PATH_MAX];
2190 Char path2 [PATH_MAX];
2191 Char path3 [PATH_MAX];
2192 Char path4 [PATH_MAX];
2193 Char path5 [PATH_MAX];
2194 CharPtr str;
2195
2196 if (sep == NULL || logfp == NULL) return;
2197 if (StringHasNoText (id) || StringHasNoText (asnval)) return;
2198
2199 TmpNam (path1);
2200 TmpNam (path2);
2201 TmpNam (path3);
2202 TmpNam (path4);
2203 TmpNam (path5);
2204
2205 RemoveAllNcbiCleanupUserObjects (sep);
2206
2207 aip = AsnIoOpen (path3, "w");
2208 if (aip == NULL) return;
2209
2210 SeqEntryAsnWrite (sep, aip, NULL);
2211 AsnIoClose (aip);
2212
2213 SeriousSeqEntryCleanup (sep, NULL, NULL);
2214 RemoveAllNcbiCleanupUserObjects (sep);
2215
2216 aip = AsnIoOpen (path4, "w");
2217 if (aip == NULL) return;
2218
2219 SeqEntryAsnWrite (sep, aip, NULL);
2220 AsnIoClose (aip);
2221
2222 sprintf (cmmd, "%s -i %s -o stdout -Q 1 -r -l | sort > %s", asnval, path3, path1);
2223 system (cmmd);
2224
2225 sprintf (cmmd, "%s -i %s -o stdout -Q 1 -r -l | sort > %s", asnval, path4, path2);
2226 system (cmmd);
2227
2228 sprintf (cmmd, "diff -h %s %s > %s", path1, path2, path5);
2229 diff = system (cmmd);
2230
2231 if (diff > 0) {
2232 fp = FileOpen (path5, "r");
2233 if (fp != NULL) {
2234 fprintf (logfp, "\n\n%s\n", id);
2235 fflush (logfp);
2236 if (FileCacheSetup (&fc, fp)) {
2237 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2238 while (str != NULL) {
2239 ch = line [0];
2240 if (ch == '<' || ch == '>' || ch == '-') {
2241 fprintf (logfp, "%s\n", line);
2242 } else if (IS_DIGIT (ch)) {
2243 fprintf (logfp, "\n%s\n", "===");
2244 }
2245 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2246 }
2247 }
2248 fflush (logfp);
2249 }
2250 FileClose (fp);
2251 }
2252
2253 sprintf (cmmd, "rm %s; rm %s; rm %s; rm %s; rm %s", path1, path2, path3, path4, path5);
2254 system (cmmd);
2255 #endif
2256 }
2257
2258 static void DoModernizeReport (
2259 SeqEntryPtr sep,
2260 CleanFlagPtr cfp
2261 )
2262
2263 {
2264 ByteStorePtr bs = NULL, tmp = NULL;
2265
2266 bs = Se2Bs (sep);
2267
2268 VisitFeaturesInSep (sep, NULL, ModGenes);
2269 tmp = Se2Bs (sep);
2270 if (! BSEqual (bs, tmp)) {
2271 if (cfp->logfp != NULL) {
2272 fprintf (cfp->logfp, "GEN %s\n", cfp->buf);
2273 fflush (cfp->logfp);
2274 }
2275 }
2276 BSFree (bs);
2277 bs = tmp;
2278
2279 VisitFeaturesInSep (sep, NULL, ModRNAs);
2280 tmp = Se2Bs (sep);
2281 if (! BSEqual (bs, tmp)) {
2282 if (cfp->logfp != NULL) {
2283 fprintf (cfp->logfp, "NCR %s\n", cfp->buf);
2284 fflush (cfp->logfp);
2285 }
2286 }
2287 BSFree (bs);
2288 bs = tmp;
2289
2290 VisitBioSourcesInSep (sep, NULL, ModPCRs);
2291 tmp = Se2Bs (sep);
2292 if (! BSEqual (bs, tmp)) {
2293 if (cfp->logfp != NULL) {
2294 fprintf (cfp->logfp, "PCR %s\n", cfp->buf);
2295 fflush (cfp->logfp);
2296 }
2297 }
2298 BSFree (bs);
2299 bs = tmp;
2300
2301 BSFree (bs);
2302 }
2303
2304 static CharPtr stopWords [] = {
2305 "a",
2306 "about",
2307 "again",
2308 "all",
2309 "almost",
2310 "also",
2311 "although",
2312 "always",
2313 "among",
2314 "an",
2315 "and",
2316 "another",
2317 "any",
2318 "are",
2319 "as",
2320 "at",
2321 "be",
2322 "because",
2323 "been",
2324 "before",
2325 "being",
2326 "between",
2327 "both",
2328 "but",
2329 "by",
2330 "can",
2331 "could",
2332 "did",
2333 "do",
2334 "does",
2335 "done",
2336 "due",
2337 "during",
2338 "each",
2339 "either",
2340 "enough",
2341 "especially",
2342 "etc",
2343 "for",
2344 "found",
2345 "from",
2346 "further",
2347 "had",
2348 "has",
2349 "have",
2350 "having",
2351 "here",
2352 "how",
2353 "however",
2354 "i",
2355 "if",
2356 "in",
2357 "into",
2358 "is",
2359 "it",
2360 "its",
2361 "itself",
2362 "just",
2363 "kg",
2364 "km",
2365 "made",
2366 "mainly",
2367 "make",
2368 "may",
2369 "mg",
2370 "might",
2371 "ml",
2372 "mm",
2373 "most",
2374 "mostly",
2375 "must",
2376 "nearly",
2377 "neither",
2378 "no",
2379 "nor",
2380 "obtained",
2381 "of",
2382 "often",
2383 "on",
2384 "our",
2385 "overall",
2386 "perhaps",
2387 "pmid",
2388 "quite",
2389 "rather",
2390 "really",
2391 "regarding",
2392 "seem",
2393 "seen",
2394 "several",
2395 "should",
2396 "show",
2397 "showed",
2398 "shown",
2399 "shows",
2400 "significantly",
2401 "since",
2402 "so",
2403 "some",
2404 "such",
2405 "than",
2406 "that",
2407 "the",
2408 "their",
2409 "theirs",
2410 "them",
2411 "then",
2412 "there",
2413 "therefore",
2414 "these",
2415 "they",
2416 "this",
2417 "those",
2418 "through",
2419 "thus",
2420 "to",
2421 "upon",
2422 "use",
2423 "used",
2424 "using",
2425 "various",
2426 "very",
2427 "was",
2428 "we",
2429 "were",
2430 "what",
2431 "when",
2432 "which",
2433 "while",
2434 "with",
2435 "within",
2436 "without",
2437 "would",
2438 NULL
2439 };
2440
2441 static Boolean IsStopWord (
2442 CharPtr str
2443 )
2444
2445 {
2446 Int2 i;
2447
2448 if (StringHasNoText (str)) return FALSE;
2449
2450 for (i = 0; stopWords [i] != NULL; i++) {
2451 if (StringICmp (str, stopWords [i]) == 0) return TRUE;
2452 }
2453
2454 return FALSE;
2455 }
2456
2457 static ValNodePtr GetAuthorMLNameList (
2458 AuthListPtr alp
2459 )
2460
2461 {
2462 AuthorPtr ap;
2463 Char buf [128];
2464 Char ch;
2465 Char chr [4];
2466 ValNodePtr head = NULL;
2467 Char initials [32];
2468 ValNodePtr last = NULL;
2469 NameStdPtr nsp;
2470 PersonIdPtr pid;
2471 CharPtr ptr;
2472 CharPtr str;
2473 ValNodePtr tmp;
2474 ValNodePtr vnp;
2475
2476 if (alp == NULL) return NULL;
2477
2478 for (vnp = alp->names; vnp != NULL; vnp = vnp->next) {
2479 buf [0] = '\0';
2480 initials [0] = '\0';
2481 switch (alp->choice) {
2482 case 1 :
2483 ap = (AuthorPtr) vnp->data.ptrvalue;
2484 if (ap == NULL) continue;
2485 pid = ap->name;
2486 if (pid == NULL) continue;
2487 if (pid->choice == 2) {
2488 nsp = pid->data;
2489 if (nsp == NULL) continue;
2490 str = nsp->names [0];
2491 if (StringHasNoText (str)) continue;
2492 StringNCpy_0 (buf, str, sizeof (buf));
2493 StringNCpy_0 (initials, nsp->names [4], sizeof (initials));
2494 }
2495 break;
2496 case 2 :
2497 case 3 :
2498 str = (CharPtr) vnp->data.ptrvalue;
2499 if (StringHasNoText (str)) continue;
2500 StringNCpy_0 (buf, str, sizeof (buf));
2501 ptr = StringChr (buf, ',');
2502 if (ptr == NULL) {
2503 ptr = StringChr (buf, ' ');
2504 }
2505 if (ptr != NULL) {
2506 *ptr = '\0';
2507 ptr++;
2508 StringNCpy_0 (initials, ptr, sizeof (initials));
2509 }
2510 break;
2511 default :
2512 break;
2513 }
2514 if (StringHasNoText (buf)) continue;
2515 if (StringDoesHaveText (initials)) {
2516 StringCat (buf, " ");
2517 chr [1] = '\0';
2518 ptr = initials;
2519 ch = *ptr;
2520 while (ch != '\0') {
2521 if (ch != ' ' && ch != '.' && ch != ',') {
2522 chr [0] = ch;
2523 StringCat (buf, chr);
2524 }
2525 ptr++;
2526 ch = *ptr;
2527 }
2528 }
2529 TrimSpacesAroundString (buf);
2530 tmp = ValNodeCopyStr (&last, 0, buf);
2531 if (head == NULL) {
2532 head = tmp;
2533 }
2534 last = tmp;
2535 }
2536
2537 return head;
2538 }
2539
2540 static ValNodePtr GetTitleWords (
2541 CharPtr title
2542 )
2543
2544 {
2545 Char ch;
2546 Boolean goOn = TRUE;
2547 ValNodePtr head = NULL;
2548 ValNodePtr last = NULL;
2549 CharPtr ptr;
2550 CharPtr str;
2551 CharPtr tmp;
2552 ValNodePtr vnp;
2553
2554 if (StringHasNoText (title)) return NULL;
2555
2556 tmp = StringSave (title);
2557 if (tmp == NULL) return NULL;
2558
2559 ptr = tmp;
2560 ch = *ptr;
2561 if (ch == '\0') {
2562 goOn = FALSE;
2563 }
2564 while (goOn) {
2565 while (ch != '\0' && (! IS_ALPHANUM (ch))) {
2566 ptr++;
2567 ch = *ptr;
2568 }
2569 str = ptr;
2570 while (ch != '\0' && IS_ALPHANUM (ch)) {
2571 ptr++;
2572 ch = *ptr;
2573 }
2574 if (ch == '\0') {
2575 goOn = FALSE;
2576 }
2577 *ptr = '\0';
2578 ptr++;
2579 ch = *ptr;
2580 TrimSpacesAroundString (str);
2581 /*
2582 if (! IsStopWord (str)) {
2583 vnp = ValNodeCopyStr (&last, 0, str);
2584 if (head == NULL) {
2585 head = vnp;
2586 }
2587 last = vnp;
2588 }
2589 */
2590 vnp = ValNodeCopyStr (&last, 0, str);
2591 if (head == NULL) {
2592 head = vnp;
2593 }
2594 last = vnp;
2595 }
2596
2597 MemFree (tmp);
2598
2599 return head;
2600 }
2601
2602 static ValNodePtr DuplicateStringList (
2603 ValNodePtr list
2604 )
2605
2606 {
2607 ValNodePtr head = NULL;
2608 ValNodePtr last = NULL;
2609 CharPtr str;
2610 ValNodePtr tmp;
2611 ValNodePtr vnp;
2612
2613 if (list == NULL) return NULL;
2614
2615 for (vnp = list; vnp != NULL; vnp = vnp->next) {
2616 str = (CharPtr) vnp->data.ptrvalue;
2617 if (StringHasNoText (str)) continue;
2618 tmp = ValNodeCopyStr (&last, 0, str);
2619 if (head == NULL) {
2620 head = tmp;
2621 }
2622 last = tmp;
2623 }
2624
2625 return head;
2626 }
2627
/* How much of the text after the last space of an author string
   TrimInitials keeps. */
typedef enum {
  FULL_INITIALS = 0,  /* string is left unchanged */
  TWO_INITIALS  = 1,  /* cut after the first two characters, if both are alphanumeric */
  ONE_INITIAL   = 2,  /* cut after the first character, if it is alphanumeric */
  NO_INITIALS   = 3   /* cut at the last space */
} InitialsPolicy;
2634
2635 static void TrimInitials (
2636 CharPtr auth,
2637 InitialsPolicy initials
2638 )
2639
2640 {
2641 Char ch;
2642 CharPtr ptr;
2643
2644 if (StringHasNoText (auth)) return;
2645
2646 switch (initials) {
2647 case FULL_INITIALS :
2648 break;
2649 case TWO_INITIALS :
2650 ptr = StringRChr (auth, ' ');
2651 if (ptr != NULL) {
2652 ptr++;
2653 ch = *ptr;
2654 if (IS_ALPHANUM (ch)) {
2655 ptr++;
2656 ch = *ptr;
2657 if (IS_ALPHANUM (ch)) {
2658 ptr++;
2659 *ptr = '\0';
2660 }
2661 }
2662 }
2663 break;
2664 case ONE_INITIAL :
2665 ptr = StringRChr (auth, ' ');
2666 if (ptr != NULL) {
2667 ptr++;
2668 ch = *ptr;
2669 if (IS_ALPHANUM (ch)) {
2670 ptr++;
2671 *ptr = '\0';
2672 }
2673 }
2674 break;
2675 case NO_INITIALS :
2676 ptr = StringRChr (auth, ' ');
2677 if (ptr != NULL) {
2678 *ptr = '\0';
2679 }
2680 break;
2681 default :
2682 break;
2683 }
2684 }
2685
2686 static Int4 DoUnpubBooleanQuery (
2687 ValNodePtr authors,
2688 InitialsPolicy initials,
2689 Boolean firstLastOnly,
2690 ValNodePtr titlewords,
2691 Int2 year,
2692 Boolean expand,
2693 Uint4Ptr uidp
2694 )
2695
2696 {
2697 Boolean addOpAnd = FALSE;
2698 Char buf [128];
2699 Int4 count = 0;
2700 Entrez2BooleanReplyPtr e2br;
2701 Entrez2IdListPtr e2id;
2702 Entrez2RequestPtr e2rp = NULL;
2703 Entrez2ReplyPtr e2ry;
2704 CharPtr str;
2705 ValNodePtr vnp;
2706
2707 if (uidp != NULL) {
2708 *uidp = 0;
2709 }
2710
2711 e2rp = EntrezCreateBooleanRequest (TRUE, FALSE, "PubMed", NULL, 0, 0, NULL, 20, 0);
2712 if (e2rp == NULL) return 0;
2713
2714 for (vnp = authors; vnp != NULL; vnp = vnp->next) {
2715 str = (CharPtr) vnp->data.ptrvalue;
2716 if (StringHasNoText (str)) continue;
2717 if (firstLastOnly) {
2718 if (vnp != authors && vnp->next != NULL) continue;
2719 }
2720 StringNCpy_0 (buf, str, sizeof (buf));
2721 switch (initials) {
2722 case TWO_INITIALS :
2723 TrimInitials (buf, TWO_INITIALS);
2724 break;
2725 case ONE_INITIAL :
2726 TrimInitials (buf, ONE_INITIAL);
2727 break;
2728 case NO_INITIALS :
2729 TrimInitials (buf, NO_INITIALS);
2730 break;
2731 default :
2732 break;
2733 }
2734 if (addOpAnd) {
2735 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2736 }
2737 EntrezAddToBooleanRequest (e2rp, NULL, 0, "AUTH", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2738 addOpAnd = TRUE;
2739 }
2740
2741 for (vnp = titlewords; vnp != NULL; vnp = vnp->next) {
2742 str = (CharPtr) vnp->data.ptrvalue;
2743 if (StringHasNoText (str)) continue;
2744 if (IsStopWord (str)) continue;
2745 StringNCpy_0 (buf, str, sizeof (buf));
2746 if (addOpAnd) {
2747 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2748 }
2749 EntrezAddToBooleanRequest (e2rp, NULL, 0, "TITL", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2750 addOpAnd = TRUE;
2751 }
2752
2753 if (year > 0) {
2754 if (addOpAnd) {
2755 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2756 }
2757 if (expand) {
2758 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2759 sprintf (buf, "%d", (int) year - 1);
2760 EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2761 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2762 sprintf (buf, "%d", (int) year);
2763 EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2764 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2765 sprintf (buf, "%d", (int) year + 1);
2766 EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2767 EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2768 } else {
2769 sprintf (buf, "%d", (int) year);
2770 EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2771 }
2772 addOpAnd = TRUE;
2773 }
2774
2775 e2ry = EntrezSynchronousQuery (e2rp);
2776
2777 e2rp = Entrez2RequestFree (e2rp);
2778 if (e2ry == NULL) return 0;
2779 e2br = EntrezExtractBooleanReply (e2ry);
2780 if (e2br == NULL) return 0;
2781
2782 count = e2br->count;
2783
2784 if (count > 0 && uidp != NULL) {
2785 e2id = e2br->uids;
2786 if (e2id != NULL && e2id->num == 1 && e2id->uids != NULL) {
2787 BSSeek (e2id->uids, 0, SEEK_SET);
2788 *uidp = Nlm_BSGetUint4 (e2id->uids);
2789 }
2790 }
2791
2792 Entrez2BooleanReplyFree (e2br);
2793
2794 return count;
2795 }
2796
2797 static CharPtr GetBestJournal (
2798 ValNodePtr journaltitle
2799 )
2800
2801 {
2802 CharPtr str;
2803 ValNodePtr vnp;
2804
2805 if (journaltitle == NULL) return NULL;
2806
2807 for (vnp = journaltitle; vnp != NULL; vnp = vnp->next) {
2808 if (vnp->choice == Cit_title_iso_jta) {
2809 str = (CharPtr) vnp->data.ptrvalue;
2810 if (StringHasNoText (str)) continue;
2811 return str;
2812 }
2813 }
2814
2815 for (vnp = journaltitle; vnp != NULL; vnp = vnp->next) {
2816 if (vnp->choice == Cit_title_name || vnp->choice == Cit_title_jta) {
2817 str = (CharPtr) vnp->data.ptrvalue;
2818 if (StringHasNoText (str)) continue;
2819 return str;
2820 }
2821 }
2822
2823 return NULL;
2824 }
2825
2826 typedef struct pubref {
2827 ValNodePtr authors;
2828 ValNodePtr titlewords;
2829 CharPtr fulltitle;
2830 CharPtr uniquestr;
2831 CharPtr journal;
2832 ImprintPtr imp;
2833 Int2 year;
2834 Uint4 pmid;
2835 } PubRef, PNTR PubRefPtr;
2836
2837 static void PrintPubAuthors (
2838 CleanFlagPtr cfp,
2839 PubRefPtr prp
2840 )
2841
2842 {
2843 CharPtr prefix = "";
2844 CharPtr str;
2845 ValNodePtr vnp;
2846
2847 if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2848
2849 for (vnp = prp->authors; vnp != NULL; vnp = vnp->next) {
2850 str = (CharPtr) vnp->data.ptrvalue;
2851 if (StringHasNoText (str)) continue;
2852 fprintf (cfp->logfp, "%s%s", prefix, str);
2853 prefix = ", ";
2854 }
2855 }
2856
2857 static void PrintPubTitle (
2858 CleanFlagPtr cfp,
2859 PubRefPtr prp
2860 )
2861
2862 {
2863 CharPtr prefix = "";
2864 CharPtr str;
2865 ValNodePtr vnp;
2866
2867 if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2868
2869 if (StringDoesHaveText (prp->fulltitle)) {
2870 fprintf (cfp->logfp, "%s%s", prefix, prp->fulltitle);
2871 } else {
2872 for (vnp = prp->titlewords; vnp != NULL; vnp = vnp->next) {
2873 str = (CharPtr) vnp->data.ptrvalue;
2874 if (StringHasNoText (str)) continue;
2875 fprintf (cfp->logfp, "%s%s", prefix, str);
2876 prefix = " ";
2877 }
2878 }
2879 }
2880
2881 static void PrintPubJournal (
2882 CleanFlagPtr cfp,
2883 PubRefPtr prp
2884 )
2885
2886 {
2887 DatePtr dp = NULL;
2888 ImprintPtr imp;
2889 CharPtr prefix = "";
2890 Int2 year;
2891
2892 if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2893
2894 if (StringHasNoText (prp->journal) && prp->imp == NULL) {
2895 fprintf (cfp->logfp, "Unpublished");
2896 prefix = " ";
2897 if (prp->year > 0) {
2898 fprintf (cfp->logfp, "%s[%d]", prefix, (int) prp->year);
2899 prefix = " ";
2900 }
2901 return;
2902 }
2903
2904 if (StringDoesHaveText (prp->journal)) {
2905 fprintf (cfp->logfp, "%s%s", prefix, prp->journal);
2906 prefix = " ";
2907 }
2908
2909 imp = prp->imp;
2910 if (imp != NULL) {
2911 dp = imp->date;
2912 if (dp != NULL && dp->data [0] == 1) {
2913 year = (Int2) dp->data [1] + 1900;
2914 fprintf (cfp->logfp, "%s[%d]", prefix, (int) year);
2915 prefix = " ";
2916 }
2917 if (StringDoesHaveText (imp->volume)) {
2918 fprintf (cfp->logfp, "%s%s", prefix, imp->volume);
2919 prefix = " ";
2920 }
2921 /*
2922 if (StringDoesHaveText (imp->issue)) {
2923 fprintf (cfp->logfp, "%s(%s)", prefix, imp->issue);
2924 prefix = " ";
2925 }
2926 */
2927 if (StringDoesHaveText (imp->pages)) {
2928 fprintf (cfp->logfp, "%s: %s", prefix, imp->pages);
2929 prefix = " ";
2930 }
2931 }
2932
2933 if (prp->pmid > 0) {
2934 fprintf (cfp->logfp, "%s<%ld>", prefix, (long) prp->pmid);
2935 prefix = " ";
2936 }
2937 }
2938
2939 /*
2940 static void PrintAuthTitle (
2941 CleanFlagPtr cfp,
2942 CharPtr label,
2943 Boolean multiline,
2944 PubRefPtr prp
2945 )
2946
2947 {
2948 DatePtr dp = NULL;
2949 ImprintPtr imp;
2950 CharPtr prefix;
2951 CharPtr separator = " ";
2952 CharPtr str;
2953 ValNodePtr vnp;
2954 Int2 year;
2955
2956 if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2957
2958 if (multiline) {
2959 separator = "\n";
2960 }
2961
2962 if (StringDoesHaveText (label)) {
2963 fprintf (cfp->logfp, "%s", label);
2964 fprintf (cfp->logfp, separator);
2965 }
2966
2967 prefix = "Author: ";
2968 for (vnp = prp->authors; vnp != NULL; vnp = vnp->next) {
2969 str = (CharPtr) vnp->data.ptrvalue;
2970 if (StringHasNoText (str)) continue;
2971 fprintf (cfp->logfp, "%s%s", prefix, str);
2972 prefix = ", ";
2973 }
2974 fprintf (cfp->logfp, separator);
2975 prefix = "Title: ";
2976 if (StringDoesHaveText (prp->fulltitle)) {
2977 fprintf (cfp->logfp, "%s%s", prefix, prp->fulltitle);
2978 } else {
2979 for (vnp = prp->titlewords; vnp != NULL; vnp = vnp->next) {
2980 str = (CharPtr) vnp->data.ptrvalue;
2981 if (StringHasNoText (str)) continue;
2982 fprintf (cfp->logfp, "%s%s", prefix, str);
2983 prefix = " ";
2984 }
2985 }
2986 fprintf (cfp->logfp, separator);
2987 year = prp->year;
2988 if (year > 0) {
2989 fprintf (cfp->logfp, "Year: %d", (int) year);
2990 fprintf (cfp->logfp, separator);
2991 }
2992
2993 if (StringDoesHaveText (prp->journal)) {
2994 fprintf (cfp->logfp, "Journal: %s", prp->journal);
2995 imp = prp->imp;
2996 if (imp != NULL) {
2997 dp = imp->date;
2998 if (dp != NULL && dp->data [0] == 1) {
2999 year = (Int2) dp->data [1] + 1900;
3000 fprintf (cfp->logfp, ". [%d]", (int) year);
3001 }
3002 if (StringDoesHaveText (imp->volume)) {
3003 fprintf (cfp->logfp, ". %s", imp->volume);
3004 }
3005 if (StringDoesHaveText (imp->issue)) {
3006 fprintf (cfp->logfp, " (%s)", imp->issue);
3007 }
3008 if (StringDoesHaveText (imp->pages)) {
3009 fprintf (cfp->logfp, " : %s", imp->pages);
3010 }
3011 }
3012 fprintf (cfp->logfp, separator);
3013 }
3014
3015 fprintf (cfp->logfp, "\n");
3016 fflush (cfp->logfp);
3017 }
3018 */
3019
/*
 * Grade of agreement between two author names, ordered from weakest to
 * strongest so that grades can be compared with < and >.
 */
typedef enum {
  NO_NAME_MATCH = 0,  /* names differ even after dropping the last word */
  LAST_NAME_MATCH,    /* equal once cut at the last space */
  ONE_INIT_MATCH,     /* equal once trimmed with ONE_INITIAL */
  TWO_INIT_MATCH,     /* equal once trimmed with TWO_INITIALS */
  FULL_NAME_MATCH     /* equal as given, ignoring case */
} AuthComp;
3027
3028 static CharPtr authlabel [] = {
3029 "AUTH_MISMATCH", "LAST_NAMES", "ONE_INIT", "TWO_INITS", "FULL_NAMES"
3030 };
3031
3032 static AuthComp AuthorCompare (
3033 CharPtr auth1,
3034 CharPtr auth2
3035 )
3036
3037 {
3038 Char buf1 [128];
3039 Char buf2 [128];
3040
3041 if (StringHasNoText (auth1) || StringHasNoText (auth2)) return NO_NAME_MATCH;
3042
3043 StringNCpy_0 (buf1, auth1, sizeof (buf1));
3044 StringNCpy_0 (buf2, auth2, sizeof (buf2));
3045
3046 if (StringICmp (buf1, buf2) == 0) return FULL_NAME_MATCH;
3047
3048 TrimInitials (buf1, TWO_INITIALS);
3049 TrimInitials (buf2, TWO_INITIALS);
3050
3051 if (StringICmp (buf1, buf2) == 0) return TWO_INIT_MATCH;
3052
3053 TrimInitials (buf1, ONE_INITIAL);
3054 TrimInitials (buf2, ONE_INITIAL);
3055
3056 if (StringICmp (buf1, buf2) == 0) return ONE_INIT_MATCH;
3057
3058 TrimInitials (buf1, NO_INITIALS);
3059 TrimInitials (buf2, NO_INITIALS);
3060
3061 if (StringICmp (buf1, buf2) == 0) return LAST_NAME_MATCH;
3062
3063 return NO_NAME_MATCH;
3064 }
3065
3066 static AuthComp AuthorsIdentical (
3067 ValNodePtr oldauthors,
3068 ValNodePtr newauthors
3069 )
3070
3071 {
3072 AuthComp curr, rsult = FULL_NAME_MATCH;
3073 CharPtr str1, str2;
3074 ValNodePtr vnp1, vnp2;
3075
3076 if (oldauthors == NULL || newauthors == NULL) return NO_NAME_MATCH;
3077
3078 for (vnp1 = oldauthors, vnp2 = newauthors;
3079 vnp1 != NULL && vnp2 != NULL;
3080 vnp1 = vnp1->next, vnp2 = vnp2->next) {
3081 str1 = (CharPtr) vnp1->data.ptrvalue;
3082 str2 = (CharPtr) vnp2->data.ptrvalue;
3083 curr = AuthorCompare (str1, str2);
3084 if (curr == NO_NAME_MATCH) return NO_NAME_MATCH;
3085 if (curr < rsult) {
3086 rsult = curr;
3087 }
3088 }
3089
3090 if (vnp1 != NULL || vnp2 != NULL) return NO_NAME_MATCH;
3091
3092 return rsult;
3093 }
3094
3095 static AuthComp AuthorInList (
3096 CharPtr author,
3097 ValNodePtr newauthors
3098 )
3099
3100 {
3101 AuthComp curr, rsult = FULL_NAME_MATCH;
3102 CharPtr str;
3103 ValNodePtr vnp;
3104
3105 if (StringHasNoText (author) || newauthors == NULL) return NO_NAME_MATCH;
3106
3107 for (vnp = newauthors; vnp != NULL; vnp = vnp->next) {
3108 str = (CharPtr) vnp->data.ptrvalue;
3109 curr = AuthorCompare (author, str);
3110 if (curr == NO_NAME_MATCH) continue;
3111 if (curr < rsult) {
3112 rsult = curr;
3113 }
3114 }
3115
3116 return rsult;
3117 }
3118
3119 static Boolean WordInList (
3120 CharPtr word,
3121 ValNodePtr newtitlewords
3122 )
3123
3124 {
3125 CharPtr str;
3126 ValNodePtr vnp;
3127
3128 if (StringHasNoText (word) || newtitlewords == NULL) return NO_NAME_MATCH;
3129
3130 for (vnp = newtitlewords; vnp != NULL; vnp = vnp->next) {
3131 str = (CharPtr) vnp->data.ptrvalue;
3132 if (StringHasNoText (str)) continue;
3133 if (StringICmp (word, str) == 0) return TRUE;
3134 }
3135
3136 return FALSE;
3137 }
3138
3139 static void PrintComparison (
3140 CleanFlagPtr cfp,
3141 PubRefPtr oldprp,
3142 PubRefPtr newprp
3143 )
3144
3145 {
3146 AuthComp authcomp, curr, best = FULL_NAME_MATCH;
3147 Int2 matches, total;
3148 CharPtr str, str1, str2;
3149 Boolean titlsame;
3150 ValNodePtr vnp;
3151
3152 if (cfp == NULL || cfp->logfp == NULL || oldprp == NULL || newprp == NULL) return;
3153
3154 authcomp = AuthorsIdentical (oldprp->authors, newprp->authors);
3155 titlsame = (Boolean) (StringICmp (oldprp->fulltitle, newprp->fulltitle) == 0);
3156
3157 fprintf (cfp->logfp, "PMID %ld", (long) newprp->pmid);
3158 fprintf (cfp->logfp, "\t");
3159
3160 fprintf (cfp->logfp, "%s", cfp->buf);
3161 fprintf (cfp->logfp, "\t");
3162
3163 fprintf (cfp->logfp, "REF_COUNT %ld", (long) cfp->unpubcount);
3164 fprintf (cfp->logfp, "\t");
3165
3166 fprintf (cfp->logfp, "ORIG_NAMES %ld", (long) ValNodeLen (oldprp->authors));
3167 fprintf (cfp->logfp, "\t");
3168
3169 fprintf (cfp->logfp, "ADDL_NAMES %ld", (long) (ValNodeLen (newprp->authors) - ValNodeLen (oldprp->authors)));
3170 fprintf (cfp->logfp, "\t");
3171
3172 fprintf (cfp->logfp, "ORIG_WORDS %ld", (long) ValNodeLen (oldprp->titlewords));
3173 fprintf (cfp->logfp, "\t");
3174
3175 fprintf (cfp->logfp, "ADDL_WORDS %ld", (long) (ValNodeLen (newprp->titlewords) - ValNodeLen (oldprp->titlewords)));
3176 fprintf (cfp->logfp, "\t");
3177
3178 if (StringDoesHaveText (oldprp->uniquestr)) {
3179 fprintf (cfp->logfp, "UNIQ_CIT %s", oldprp->uniquestr);
3180 } else {
3181 fprintf (cfp->logfp, "?");
3182 }
3183 fprintf (cfp->logfp, "\t");
3184
3185 if (authcomp != NO_NAME_MATCH) {
3186 str = authlabel [(int) authcomp];
3187 fprintf (cfp->logfp, "AUTHORS_SAME [%s]", str);
3188 fprintf (cfp->logfp, "\t");
3189 } else {
3190 total = ValNodeLen (newprp->authors);
3191
3192 matches = 0;
3193 for (vnp = oldprp->authors; vnp != NULL; vnp = vnp->next) {
3194 str = (CharPtr) vnp->data.ptrvalue;
3195 if (StringHasNoText (str)) continue;
3196 curr = AuthorInList (str, newprp->authors);
3197 if (curr == NO_NAME_MATCH) continue;
3198 matches++;
3199 if (curr < best) {
3200 best = curr;
3201 }
3202 }
3203
3204 str = authlabel [(int) best];
3205 fprintf (cfp->logfp, "AUTHORS_DIFFER [%s] %d / %d", str, (int) matches, (int) total);
3206 fprintf (cfp->logfp, "\t");
3207 }
3208
3209 if (titlsame) {
3210 fprintf (cfp->logfp, "TITLE_SAME");
3211 } else {
3212 total = 0;
3213 for (vnp = newprp->titlewords; vnp != NULL; vnp = vnp->next) {
3214 str = (CharPtr) vnp->data.ptrvalue;
3215 if (StringHasNoText (str)) continue;
3216 if (IsStopWord (str)) continue;
3217 total++;
3218 }
3219
3220 matches = 0;
3221 for (vnp = oldprp->titlewords; vnp != NULL; vnp = vnp->next) {
3222 str = (CharPtr) vnp->data.ptrvalue;
3223 if (StringHasNoText (str)) continue;
3224 if (IsStopWord (str)) continue;
3225 if (! WordInList (str, newprp->titlewords)) continue;
3226 matches++;
3227 }
3228
3229 str1 = NULL;
3230 str2 = NULL;
3231 vnp = oldprp->titlewords;
3232 if (vnp != NULL) {
3233 str1 = (CharPtr) vnp->data.ptrvalue;
3234 }
3235 vnp = newprp->titlewords;
3236 if (vnp != NULL) {
3237 str2 = (CharPtr) vnp->data.ptrvalue;
3238 }
3239 if (str1 != NULL && str2 != NULL && StringCmp (str1, str2) == 0 && total > 0 && matches == total) {
3240 fprintf (cfp->logfp, "TITLE_SIMILAR %d / %d", (int) matches, (int) total);
3241 } else if (total > 0 && matches == total) {
3242 fprintf (cfp->logfp, "TITLE_SUSPECT %d / %d", (int) matches, (int) total);
3243 } else {
3244 fprintf (cfp->logfp, "TITLE_DIFFERS %d / %d", (int) matches, (int) total);
3245 }
3246 }
3247 fprintf (cfp->logfp, "\t");
3248
3249 PrintPubAuthors (cfp, oldprp);
3250 fprintf (cfp->logfp, "\t");
3251
3252 PrintPubAuthors (cfp, newprp);
3253 fprintf (cfp->logfp, "\t");
3254
3255 PrintPubTitle (cfp, oldprp);
3256 fprintf (cfp->logfp, "\t");
3257
3258 PrintPubTitle (cfp, newprp);
3259 fprintf (cfp->logfp, "\t");
3260
3261 PrintPubJournal (cfp, oldprp);
3262 fprintf (cfp->logfp, "\t");
3263
3264 PrintPubJournal (cfp, newprp);
3265
3266 fprintf (cfp->logfp, "\n");
3267
3268 /*
3269 if (identical) {
3270 PrintAuthTitle (cfp, "EXACT: ", FALSE, newprp);
3271 } else {
3272 PrintAuthTitle (cfp, "BEFORE: ", FALSE, oldprp);
3273 PrintAuthTitle (cfp, "AFTER: ", FALSE, newprp);
3274 }
3275 */
3276 }
3277
3278 static void StrStripSpaces (
3279 CharPtr str
3280 )
3281
3282 {
3283 CharPtr new_str;
3284
3285 if (str == NULL) return;
3286
3287 new_str = str;
3288 while (*str != '\0') {
3289 *new_str++ = *str;
3290 if (*str == ' ' || *str == '\t' || *str == '(') {
3291 for (str++; *str == ' ' || *str == '\t'; str++) continue;
3292 if (*str == ')' || *str == ',') {
3293 new_str--;
3294 }
3295 } else {
3296 str++;
3297 }
3298 }
3299 *new_str = '\0';
3300 }
3301
3302 static void StrStripBrackets (
3303 CharPtr str
3304 )
3305
3306 {
3307 size_t len;
3308
3309 if (str == NULL) return;
3310
3311 len = StringLen (str);
3312 if (len < 2) return;
3313
3314 if (str [0] == '[') {
3315 str [0] = ' ';
3316 }
3317
3318 if (str [len - 1] == ']') {
3319 str [len - 1] = ' ';
3320 }
3321 }
3322
3323 static void PrintPubMedCit (
3324 CleanFlagPtr cfp,
3325 Uint4 pmid,
3326 PubRefPtr oldprp
3327 )
3328
3329 {
3330 CitArtPtr cap;
3331 CitJourPtr cjp;
3332 DatePtr dp;
3333 ImprintPtr imp;
3334 MedlineEntryPtr mep;
3335 PubmedEntryPtr pep;
3336 PubRef pr;
3337 CharPtr str;
3338 CharPtr tmp;
3339 ValNodePtr vnp;
3340
3341 if (cfp == NULL || cfp->logfp == NULL || pmid < 1) return;
3342
3343 MemSet ((Pointer) &pr, 0, sizeof (PubRef));
3344
3345 pep = PubMedSynchronousQuery (pmid);
3346 if (pep == NULL) return;
3347
3348 mep = (MedlineEntryPtr) pep->medent;
3349 if (mep != NULL && mep->cit != NULL) {
3350 cap = mep->cit;
3351 if (cap != NULL) {
3352 pr.authors = GetAuthorMLNameList (cap->authors);
3353 for (vnp = cap->title; vnp != NULL; vnp = vnp->next) {
3354 if (vnp->choice == Cit_title_name) {
3355 str = (CharPtr) vnp->data.ptrvalue;
3356 if (StringHasNoText (str)) continue;
3357 pr.titlewords = GetTitleWords (str);
3358 tmp = StringSave (str);
3359 TrimSpacesAndJunkFromEnds (tmp, TRUE);
3360 s_RemovePeriodFromEnd (tmp);
3361 StrStripBrackets (tmp);
3362 StrStripSpaces (tmp);
3363 pr.fulltitle = tmp;
3364 }
3365 }
3366 if (cap->from == 1) {
3367 cjp = (CitJourPtr) cap->fromptr;
3368 if (cjp != NULL) {
3369 pr.journal = GetBestJournal (cjp->title);
3370 imp = cjp->imp;
3371 pr.imp = imp;
3372 if (imp != NULL) {
3373 dp = imp->date;
3374 if (dp != NULL && dp->data [0] == 1) {
3375 pr.year = (Int2) dp->data [1] + 1900;
3376 }
3377 }
3378 }
3379 }
3380 pr.pmid = pmid;
3381 if (pr.authors != NULL && pr.titlewords != NULL) {
3382 /*
3383 PrintAuthTitle (cfp, "RESULT", TRUE, &pr);
3384 */
3385 PrintComparison (cfp, oldprp, &pr);
3386 }
3387 ValNodeFreeData (pr.authors);
3388 ValNodeFreeData (pr.titlewords);
3389 MemFree (pr.fulltitle);
3390 }
3391 }
3392
3393 pep = PubmedEntryFree (pep);
3394 }
3395
3396 static void TryEntrezQueries (
3397 CleanFlagPtr cfp,
3398 PubRefPtr prp
3399 )
3400
3401 {
3402 Int4 count;
3403 Uint4 pmid = 0;
3404
3405 if (cfp == NULL || cfp->logfp == NULL || prp == NULL || prp->authors == NULL) return;
3406
3407 count = DoUnpubBooleanQuery (prp->authors, ONE_INITIAL, FALSE, prp->titlewords, prp->year, TRUE, &pmid);
3408
3409 if (count > 1) {
3410 count = DoUnpubBooleanQuery (prp->authors, TWO_INITIALS, FALSE, prp->titlewords, prp->year, TRUE, &pmid);
3411 }
3412
3413 if (count < 1) {
3414
3415 fprintf (cfp->logfp, "0\t%s\tUNPUB\t", cfp->buf);
3416 PrintPubAuthors (cfp, prp);
3417 fprintf (cfp->logfp, "\t");
3418 PrintPubTitle (cfp, prp);
3419 fprintf (cfp->logfp, "\t");
3420 PrintPubJournal (cfp, prp);
3421 fprintf (cfp->logfp, "\n");
3422
3423 } else if (count > 1) {
3424
3425 fprintf (cfp->logfp, "0\t%s\tCOUNT %ld\t", cfp->buf, (long) count);
3426 PrintPubAuthors (cfp, prp);
3427 fprintf (cfp->logfp, "\t");
3428 PrintPubTitle (cfp, prp);
3429 fprintf (cfp->logfp, "\t");
3430 PrintPubJournal (cfp, prp);
3431 fprintf (cfp->logfp, "\n");
3432
3433 } else {
3434
3435 PrintPubMedCit (cfp, pmid, prp);
3436 }
3437
3438 fflush (cfp->logfp);
3439 }
3440
3441 static void CountUnpubPub (
3442 PubdescPtr pdp,
3443 Pointer userdata
3444 )
3445
3446 {
3447 CleanFlagPtr cfp;
3448 CitGenPtr cgp = NULL;
3449 Boolean hasUnpublished = FALSE;
3450 ValNodePtr vnp;
3451
3452 if (pdp == NULL) return;
3453 cfp = (CleanFlagPtr) userdata;
3454 if (cfp == NULL) return;
3455
3456 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3457 if (vnp->choice == PUB_Gen) {
3458 cgp = (CitGenPtr) vnp->data.ptrvalue;
3459 if (cgp != NULL) {
3460 if (StringICmp (cgp->cit, "Unpublished") == 0) {
3461 if (StringICmp (cgp->title, "Direct Submission") != 0) {
3462 hasUnpublished = TRUE;
3463 }
3464 }
3465 }
3466 } else if (vnp->choice == PUB_Muid || vnp->choice == PUB_PMid) {
3467 return;
3468 } else if (vnp->choice == PUB_Article || vnp->choice == PUB_Book || vnp->choice == PUB_Man) {
3469 return;
3470 }
3471 }
3472
3473 if (! hasUnpublished) return;
3474 if (cgp == NULL) return;
3475
3476 (cfp->unpubcount)++;
3477 }
3478
3479 static void ProcessUnpubPub (
3480 PubdescPtr pdp,
3481 Pointer userdata
3482 )
3483
3484 {
3485 Char buf [521];
3486 CleanFlagPtr cfp;
3487 CitGenPtr cgp = NULL;
3488 DatePtr dp = NULL;
3489 Boolean hasUnpublished = FALSE;
3490 PubRef pr;
3491 CharPtr tmp;
3492 ValNodePtr vnp, vnpcgp = NULL;
3493 Int2 year = 0;
3494
3495 if (pdp == NULL) return;
3496 cfp = (CleanFlagPtr) userdata;
3497 if (cfp == NULL) return;
3498
3499 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3500 if (vnp->choice == PUB_Gen) {
3501 cgp = (CitGenPtr) vnp->data.ptrvalue;
3502 if (cgp != NULL) {
3503 if (StringICmp (cgp->cit, "Unpublished") == 0) {
3504 if (StringICmp (cgp->title, "Direct Submission") != 0) {
3505 hasUnpublished = TRUE;
3506 vnpcgp = vnp;
3507 }
3508 }
3509 }
3510 } else if (vnp->choice == PUB_Muid || vnp->choice == PUB_PMid) {
3511 return;
3512 } else if (vnp->choice == PUB_Article || vnp->choice == PUB_Book || vnp->choice == PUB_Man) {
3513 return;
3514 }
3515 }
3516
3517 if (! hasUnpublished) return;
3518 if (cgp == NULL) return;
3519
3520 MemSet ((Pointer) &pr, 0, sizeof (PubRef));
3521
3522 pr.authors = GetAuthorMLNameList (cgp->authors);
3523 pr.titlewords = GetTitleWords (cgp->title);
3524 if (vnpcgp != NULL) {
3525 if (PubLabelUnique (vnpcgp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) {
3526 pr.uniquestr = StringSaveNoNull (buf);
3527 }
3528 }
3529
3530 tmp = StringSave (cgp->title);
3531 TrimSpacesAndJunkFromEnds (tmp, TRUE);
3532 s_RemovePeriodFromEnd (tmp);
3533 StrStripBrackets (tmp);
3534 StrStripSpaces (tmp);
3535 pr.fulltitle = tmp;
3536
3537 pr.journal = GetBestJournal (cgp->journal);
3538 pr.imp = NULL;
3539
3540 dp = cgp->date;
3541 if (dp != NULL && dp->data [0] == 1) {
3542 year = (Int2) dp->data [1] + 1900;
3543 }
3544 if (year == 0) {
3545 year = cfp->year;
3546 }
3547 pr.year = year;
3548 pr.pmid = 0;
3549
3550 if (pr.authors != NULL && pr.titlewords != NULL) {
3551 TryEntrezQueries (cfp, &pr);
3552 }
3553
3554 ValNodeFreeData (pr.authors);
3555 ValNodeFreeData (pr.titlewords);
3556 MemFree (pr.fulltitle);
3557 MemFree (pr.uniquestr);
3558 }
3559
3560 static void DoUnpublishedReport (
3561 SeqEntryPtr sep,
3562 CleanFlagPtr cfp
3563 )
3564
3565 {
3566 if (sep == NULL || cfp == NULL) return;
3567
3568 cfp->unpubcount = 0;
3569 VisitPubdescsInSep (sep, (Pointer) cfp, CountUnpubPub);
3570 VisitPubdescsInSep (sep, (Pointer) cfp, ProcessUnpubPub);
3571 }
3572
3573 static void CountPublishedPub (
3574 PubdescPtr pdp,
3575 Pointer userdata
3576 )
3577
3578 {
3579 CleanFlagPtr cfp;
3580 CitArtPtr cap = NULL;
3581 ValNodePtr vnp;
3582
3583 if (pdp == NULL) return;
3584 cfp = (CleanFlagPtr) userdata;
3585 if (cfp == NULL) return;
3586
3587 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3588 if (vnp->choice == PUB_Article) {
3589 cap = (CitArtPtr) vnp->data.ptrvalue;
3590 } else if (vnp->choice == PUB_PMid) {
3591 return;
3592 }
3593 }
3594
3595 if (cap == NULL) return;
3596
3597 (cfp->unpubcount)++;
3598 }
3599
3600 static void ProcessPublishedPub (
3601 PubdescPtr pdp,
3602 Pointer userdata
3603 )
3604
3605 {
3606 CleanFlagPtr cfp;
3607 CitArtPtr cap = NULL;
3608 CitJourPtr cjp;
3609 DatePtr dp = NULL;
3610 ImprintPtr imp;
3611 PubRef pr;
3612 CharPtr str;
3613 CharPtr tmp;
3614 ValNodePtr vnp;
3615 Int2 year = 0;
3616
3617 if (pdp == NULL) return;
3618 cfp = (CleanFlagPtr) userdata;
3619 if (cfp == NULL) return;
3620
3621 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3622 if (vnp->choice == PUB_Article) {
3623 cap = (CitArtPtr) vnp->data.ptrvalue;
3624 } else if (vnp->choice == PUB_PMid) {
3625 return;
3626 }
3627 }
3628
3629 if (cap == NULL) return;
3630
3631 MemSet ((Pointer) &pr, 0, sizeof (PubRef));
3632
3633 pr.authors = GetAuthorMLNameList (cap->authors);
3634 for (vnp = cap->title; vnp != NULL; vnp = vnp->next) {
3635 if (vnp->choice == Cit_title_name) {
3636 str = (CharPtr) vnp->data.ptrvalue;
3637 if (StringHasNoText (str)) continue;
3638 pr.titlewords = GetTitleWords (str);
3639 tmp = StringSave (str);
3640 TrimSpacesAndJunkFromEnds (tmp, TRUE);
3641 s_RemovePeriodFromEnd (tmp);
3642 StrStripBrackets (tmp);
3643 StrStripSpaces (tmp);
3644 pr.fulltitle = tmp;
3645 }
3646 }
3647
3648 if (cap->from == 1) {
3649 cjp = (CitJourPtr) cap->fromptr;
3650 if (cjp != NULL) {
3651 pr.journal = GetBestJournal (cjp->title);
3652 imp = cjp->imp;
3653 pr.imp = imp;
3654 if (imp != NULL) {
3655 dp = imp->date;
3656 if (dp != NULL && dp->data [0] == 1) {
3657 year = (Int2) dp->data [1] + 1900;
3658 }
3659 }
3660 }
3661 }
3662
3663 if (year == 0) {
3664 year = cfp->year;
3665 }
3666 pr.year = year;
3667 pr.pmid = 0;
3668
3669 if (pr.authors != NULL && pr.titlewords != NULL) {
3670 TryEntrezQueries (cfp, &pr);
3671 }
3672
3673 ValNodeFreeData (pr.authors);
3674 ValNodeFreeData (pr.titlewords);
3675 MemFree (pr.fulltitle);
3676 }
3677
3678 static void DoPublishedReport (
3679 SeqEntryPtr sep,
3680 CleanFlagPtr cfp
3681 )
3682
3683 {
3684 if (sep == NULL || cfp == NULL) return;
3685
3686 cfp->unpubcount = 0;
3687 VisitPubdescsInSep (sep, (Pointer) cfp, CountPublishedPub);
3688 VisitPubdescsInSep (sep, (Pointer) cfp, ProcessPublishedPub);
3689 }
3690
3691 static void RemoveFeatureCitations (
3692 SeqFeatPtr sfp,
3693 Pointer userdata
3694 )
3695
3696 {
3697 if (sfp == NULL || sfp->cit == NULL) return;
3698
3699 sfp->cit = PubSetFree (sfp->cit);
3700 }
3701
#ifdef OS_UNIX
/*
 * CppBasicCleanup compares the results of two external cleanup pipelines
 * on the same record.
 *
 * After stripping feature citations from sep, the record is written to a
 * temporary file.  Two shell commands are run on it: test_basic_cleanup
 * piped into cleanasn (output in a second temporary file) and cleanasn
 * alone (output in a third).  Both outputs are read back, converted with
 * Se2Bs, and compared with BSEqual.
 *
 * When the first pipeline's output cannot be read, "EMPTY <id>" is logged.
 * When the two differ, "BSEC DIFF <id>" is logged and, if cfp->gi is
 * positive, a banner and the diff of the two files are appended to
 * ~/Desktop/diffclean.txt.  The temporary files are removed at the end.
 *
 * Returns the entry read from the first pipeline's output (possibly NULL);
 * returns NULL immediately if sep or cfp is NULL.
 */
static SeqEntryPtr CppBasicCleanup (
  SeqEntryPtr sep,
  CleanFlagPtr cfp
)

{
  ByteStorePtr  cbytes, cppbytes;
  Char          cfile [PATH_MAX];
  Char          command [512];
  Char          cppfile [PATH_MAX];
  SeqEntryPtr   cresult, cppresult;
  Char          infile [PATH_MAX];
  AsnIoPtr      io;

  if (sep == NULL || cfp == NULL) return NULL;

  VisitFeaturesInSep (sep, NULL, RemoveFeatureCitations);

  TmpNam (infile);
  TmpNam (cppfile);
  TmpNam (cfile);

  /* write the input record */
  io = AsnIoOpen (infile, "w");
  SeqEntryAsnWrite (sep, io, NULL);
  AsnIoClose (io);

  /* run both pipelines on the same input */
  sprintf (command, "%s -i %s | cleanasn -a e -o %s",
           "~/ncbi_cxx/compilers/xCode/build/bin/Debug/test_basic_cleanup",
           infile, cppfile);
  system (command);

  sprintf (command, "cleanasn -i %s -o %s -K b",
           infile, cfile);
  system (command);

  /* read both results back */
  io = AsnIoOpen (cfile, "r");
  cresult = SeqEntryAsnRead (io, NULL);
  AsnIoClose (io);

  cbytes = Se2Bs (cresult);

  io = AsnIoOpen (cppfile, "r");
  cppresult = SeqEntryAsnRead (io, NULL);
  AsnIoClose (io);

  cppbytes = Se2Bs (cppresult);

  if (cppresult == NULL) {
    if (cfp->logfp != NULL) {
      fprintf (cfp->logfp, "EMPTY %s\n", cfp->buf);
      fflush (cfp->logfp);
    }
  } else if (! BSEqual (cbytes, cppbytes)) {
    if (cfp->logfp != NULL) {
      fprintf (cfp->logfp, "BSEC DIFF %s\n", cfp->buf);
      fflush (cfp->logfp);
    }
    if (cfp->gi > 0) {
      /* two blank lines, a banner, one blank line, then the diff */
      system ("echo '' >> ~/Desktop/diffclean.txt");
      system ("echo '' >> ~/Desktop/diffclean.txt");
      sprintf (command, "echo '********** gi|%ld **********' >> ~/Desktop/diffclean.txt", (long) cfp->gi);
      system (command);
      system ("echo '' >> ~/Desktop/diffclean.txt");
      sprintf (command, "diff %s %s >> ~/Desktop/diffclean.txt", cfile, cppfile);
      system (command);
    }
  }

  BSFree (cbytes);
  BSFree (cppbytes);

  SeqEntryFree (cresult);

  sprintf (command, "rm %s; rm %s; rm %s", infile, cppfile, cfile);
  system (command);

  return cppresult;
}
#endif
3785
3786 /* now only strips serials for local, general, refseq, and 2+6 genbank ids */
3787 static void CheckForSwissProtIDX (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
3788
3789 {
3790 BioseqPtr bsp;
3791 SeqIdPtr sip;
3792 BoolPtr stripSerial;
3793 TextSeqIdPtr tsip;
3794
3795 if (sep == NULL) return;
3796 if (IS_Bioseq (sep)) {
3797 bsp = (BioseqPtr) sep->data.ptrvalue;
3798 if (bsp == NULL) return;
3799 stripSerial = (BoolPtr) mydata;
3800 if (stripSerial == NULL) return;
3801 for (sip = bsp->id; sip != NULL; sip = sip->next) {
3802 switch (sip->choice) {
3803 case SEQID_GIBBSQ :
3804 case SEQID_GIBBMT :
3805 *stripSerial = FALSE;
3806 break;
3807 case SEQID_EMBL :
3808 case SEQID_PIR :
3809 case SEQID_SWISSPROT :
3810 case SEQID_PATENT :
3811 case SEQID_DDBJ :
3812 case SEQID_PRF :
3813 case SEQID_PDB :
3814 case SEQID_TPE:
3815 case SEQID_TPD:
3816 case SEQID_GPIPE:
3817 *stripSerial = FALSE;
3818 break;
3819 case SEQID_GENBANK :
3820 case SEQID_TPG:
3821 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
3822 if (tsip != NULL) {
3823 if (StringLen (tsip->accession) == 6) {
3824 *stripSerial = FALSE;
3825 }
3826 }
3827 break;
3828 case SEQID_NOT_SET :
3829 case SEQID_LOCAL :
3830 case SEQID_OTHER :
3831 case SEQID_GENERAL :
3832 break;
3833 default :
3834 break;
3835 }
3836 }
3837 }
3838 }
3839
3840 static time_t DoCleanup (
3841 SeqEntryPtr sep,
3842 Uint2 entityID,
3843 CleanFlagPtr cfp,
3844 AsnIoPtr aop,
3845 AsnTypePtr atp,
3846 SeqSubmitPtr ssp
3847 )
3848
3849 {
3850 BioseqPtr bsp;
3851 DatePtr dp;
3852 SeqEntryPtr fsep, nsep = NULL;
3853 Boolean isDdbj = FALSE, isEmbl = FALSE, isGenBank = FALSE, isNcbi = FALSE, isRefSeq = FALSE;
3854 Int4 nucs, prts;
3855 SumDataPtr sdp;
3856 SeqIdPtr sip, siphead;
3857 time_t starttime, stoptime;
3858 Boolean stripSerial = TRUE;
3859 SeqDescrPtr vnp;
3860
3861 if (sep == NULL || cfp == NULL) return 0;
3862
3863 AssignIDsInEntityEx (entityID, 0, NULL, NULL);
3864
3865 starttime = GetSecs ();
3866
3867 StringCpy (cfp->buf, "");
3868 cfp->gi = 0;
3869 cfp->year = 0;
3870 cfp->isRefSeq = FALSE;
3871
3872 fsep = FindNthBioseq (sep, 1);
3873 if (fsep != NULL && fsep->choice == 1) {
3874 bsp = (BioseqPtr) fsep->data.ptrvalue;
3875 if (bsp != NULL) {
3876 siphead = SeqIdSetDup (bsp->id);
3877 for (sip = siphead; sip != NULL; sip = sip->next) {
3878 SeqIdStripLocus (sip);
3879 if (sip->choice == SEQID_GI) {
3880 cfp->gi = (Int4) sip->data.intvalue;
3881 } else if (sip->choice == SEQID_GENBANK || sip->choice == SEQID_TPG) {
3882 isGenBank = TRUE;
3883 isNcbi = TRUE;
3884 } else if (sip->choice == SEQID_EMBL || sip->choice == SEQID_TPE) {
3885 isEmbl = TRUE;
3886 } else if (sip->choice == SEQID_DDBJ || sip->choice == SEQID_TPD) {
3887 isDdbj = TRUE;
3888 } else if (sip->choice == SEQID_OTHER) {
3889 isRefSeq = TRUE;
3890 isNcbi = TRUE;
3891 cfp->isRefSeq = TRUE;
3892 }
3893 }
3894 SeqIdWrite (siphead, cfp->buf, PRINTID_FASTA_LONG, sizeof (cfp->buf));
3895 SeqIdSetFree (siphead);
3896 }
3897 vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_update_date, NULL);
3898 if (vnp == NULL) {
3899 vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_create_date, NULL);
3900 }
3901 if (vnp != NULL) {
3902 dp = (DatePtr) vnp->data.ptrvalue;
3903 if (dp != NULL && dp->data [0] == 1) {
3904 cfp->year = (Int2) dp->data [1] + 1900;
3905 }
3906 }
3907 }
3908
3909 SeqEntryExplore (sep, (Pointer) &stripSerial, CheckForSwissProtIDX);
3910 cfp->stripSerial = stripSerial;
3911
3912 if (StringDoesHaveText (cfp->sourcedb)) {
3913 if (StringChr (cfp->sourcedb, 'g') != NULL) {
3914 if (! isGenBank) return 0;
3915 }
3916 if (StringChr (cfp->sourcedb, 'e') != NULL) {
3917 if (! isEmbl) return 0;
3918 }
3919 if (StringChr (cfp->sourcedb, 'd') != NULL) {
3920 if (! isDdbj) return 0;
3921 }
3922 if (StringChr (cfp->sourcedb, 'r') != NULL) {
3923 if (! isRefSeq) return 0;
3924 }
3925 if (StringChr (cfp->sourcedb, 'n') != NULL) {
3926 if (! isNcbi) return 0;
3927 }
3928 if (StringChr (cfp->sourcedb, 'x') != NULL) {
3929 if (isEmbl || isDdbj) return 0;
3930 }
3931 }
3932
3933 nucs = VisitSequencesInSep (sep, NULL, VISIT_NUCS, NULL);
3934 prts = VisitSequencesInSep (sep, NULL, VISIT_PROTS, NULL);
3935 cfp->rawcounts.nucs += nucs;
3936 cfp->rawcounts.prts += prts;
3937 (cfp->rawcounts.recs)++;
3938 cfp->cumcounts.nucs += nucs;
3939 cfp->cumcounts.prts += prts;
3940 (cfp->cumcounts.recs)++;
3941
3942 sdp = NULL;
3943 if (isGenBank) {
3944 sdp = &(cfp->dbsums.genbank);
3945 } else if (isEmbl) {
3946 sdp = &(cfp->dbsums.embl);
3947 } else if (isDdbj) {
3948 sdp = &(cfp->dbsums.ddbj);
3949 } else if (isRefSeq) {
3950 sdp = &(cfp->dbsums.refseq);
3951 } else {
3952 sdp = &(cfp->dbsums.other);
3953 }
3954 if (sdp != NULL) {
3955 sdp->nucs += nucs;
3956 sdp->prts += prts;
3957 (sdp->recs)++;
3958 }
3959
3960 if (StringChr (cfp->report, 'c') != NULL) {
3961 return 0;
3962 }
3963 if (StringChr (cfp->report, 'r') != NULL) {
3964 DoASNReport (sep, cfp, FALSE, FALSE);
3965 stoptime = GetSecs ();
3966 return stoptime - starttime;
3967 }
3968 if (StringChr (cfp->report, 's') != NULL) {
3969 DoASNReport (sep, cfp, TRUE, FALSE);
3970 stoptime = GetSecs ();
3971 return stoptime - starttime;
3972 }
3973 if (StringChr (cfp->report, 'n') != NULL) {
3974 DoASNReport (sep, cfp, TRUE, TRUE);
3975 stoptime = GetSecs ();
3976 return stoptime - starttime;
3977 }
3978 if (StringChr (cfp->report, 'd') != NULL) {
3979 DoAsnDiffReport (sep, cfp);
3980 stoptime = GetSecs ();
3981 return stoptime - starttime;
3982 }
3983 if (StringChr (cfp->report, 'g') != NULL) {
3984 DoGBFFReport (sep, cfp, 1);
3985 stoptime = GetSecs ();
3986 return stoptime - starttime;
3987 }
3988 if (StringChr (cfp->report, 'f') != NULL) {
3989 DoGBFFReport (sep, cfp, 2);
3990 stoptime = GetSecs ();
3991 return stoptime - starttime;
3992 }
3993 if (StringChr (cfp->report, 'v') != NULL) {
3994 DoValidatorReport (sep, cfp->logfp, cfp->buf, cfp->asnval);
3995 stoptime = GetSecs ();
3996 return stoptime - starttime;
3997 }
3998 if (StringChr (cfp->report, 'm') != NULL) {
3999 DoModernizeReport (sep, cfp);
4000 stoptime = GetSecs ();
4001 return stoptime - starttime;
4002 }
4003 if (StringChr (cfp->report, 'u') != NULL) {
4004 DoUnpublishedReport (sep, cfp);
4005 stoptime = GetSecs ();
4006 return stoptime - starttime;
4007 }
4008 if (StringChr (cfp->report, 'p') != NULL) {
4009 DoPublishedReport (sep, cfp);
4010 stoptime = GetSecs ();
4011 return stoptime - starttime;
4012 }
4013
4014 if (StringDoesHaveText (cfp->report)) return 0;
4015
4016 if (cfp->logfp != NULL) {
4017 fprintf (cfp->logfp, "%s\n", cfp->buf);
4018 fflush (cfp->logfp);
4019 }
4020
4021 if (StringChr (cfp->clean, 'b') != NULL) {
4022 BasicSeqEntryCleanup (sep);
4023 }
4024 #ifdef OS_UNIX
4025 if (StringChr (cfp->clean, 'p') != NULL) {
4026 nsep = CppBasicCleanup (sep, cfp);
4027 }
4028 #endif
4029 if (StringChr (cfp->clean, 's') != NULL) {
4030 SeriousSeqEntryCleanup (sep, NULL, NULL);
4031 }
4032 if (StringChr (cfp->clean, 'g') != NULL) {
4033 GpipeSeqEntryCleanup (sep);
4034 }
4035 if (StringChr (cfp->clean, 'n') != NULL) {
4036 NormalizeDescriptorOrder (sep);
4037 }
4038 if (StringChr (cfp->clean, 'u') != NULL) {
4039 RemoveAllNcbiCleanupUserObjects (sep);
4040 }
4041
4042 if (StringChr (cfp->modernize, 'g') != NULL) {
4043 VisitFeaturesInSep (sep, NULL, ModGenes);
4044 }
4045 if (StringChr (cfp->modernize, 'r') != NULL) {
4046 VisitFeaturesInSep (sep, NULL, ModRNAs);
4047 }
4048 if (StringChr (cfp->modernize, 'p') != NULL) {
4049 VisitBioSourcesInSep (sep, NULL, ModPCRs);
4050 }
4051
4052 if (cfp->taxon) {
4053 Taxon3ReplaceOrgInSeqEntry (sep, FALSE);
4054 }
4055
4056 if (cfp->pub) {
4057 VisitPubdescsInSep (sep, NULL, LookupPubdesc);
4058 }
4059
4060 if (StringChr (cfp->link, 'o') != NULL) {
4061 SeqMgrIndexFeatures (entityID, 0);
4062 LinkCDSmRNAbyOverlap (sep);
4063 }
4064 if (StringChr (cfp->link, 'p') != NULL) {
4065 SeqMgrIndexFeatures (entityID, 0);
4066 LinkCDSmRNAbyProduct (sep);
4067 }
4068 if (StringChr (cfp->link, 'r') != NULL) {
4069 SeqMgrIndexFeatures (entityID, 0);
4070 ReassignFeatureIDs (sep);
4071 }
4072 if (StringChr (cfp->link, 'c') != NULL) {
4073 ClearFeatureIDs (sep);
4074 }
4075
4076 if (StringChr (cfp->feat, 'u') != NULL) {
4077 VisitFeaturesInSep (sep, NULL, RemoveFeatUser);
4078 }
4079 if (StringChr (cfp->feat, 'd') != NULL) {
4080 VisitFeaturesInSep (sep, NULL, RemoveFeatDbxref);
4081 }
4082 if (StringChr (cfp->feat, 'r') != NULL) {
4083 SeqMgrIndexFeatures (entityID, 0);
4084 VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref);
4085 }
4086
4087 if (StringChr (cfp->desc, 't') != NULL) {
4088 VisitDescriptorsInSep (sep, NULL, MarkTitles);
4089 DeleteMarkedObjects (entityID, 0, NULL);
4090 }
4091
4092 if (StringChr (cfp->mods, 'd') != NULL) {
4093 SeqMgrIndexFeatures (entityID, 0);
4094 DoAutoDef (sep, entityID);
4095 }
4096
4097 if (cfp->action_list != NULL) {
4098 ApplyMacroToSeqEntry (sep, cfp->action_list, NULL, NULL);
4099 }
4100
4101 stoptime = GetSecs ();
4102
4103 if (aop != NULL) {
4104 if (ssp != NULL) {
4105 SeqSubmitAsnWrite (ssp, aop, atp);
4106 } else if (nsep != NULL) {
4107 SeqEntryAsnWrite (nsep, aop, atp);
4108 SeqEntryFree (nsep);
4109 } else {
4110 SeqEntryAsnWrite (sep, aop, atp);
4111 }
4112 }
4113
4114 return stoptime - starttime;
4115 }
4116
4117 static void CleanupSingleRecord (
4118 CharPtr filename,
4119 CleanFlagPtr cfp
4120 )
4121
4122 {
4123 AsnIoPtr aip, aop = NULL;
4124 BioseqPtr bsp;
4125 BioseqSetPtr bssp;
4126 Pointer dataptr = NULL;
4127 Uint2 datatype, entityID = 0;
4128 FILE *fp;
4129 Char path [PATH_MAX];
4130 CharPtr ptr;
4131 SeqEntryPtr sep;
4132 SeqSubmitPtr ssp = NULL;
4133
4134 if (cfp == NULL) return;
4135
4136 if (StringHasNoText (filename)) return;
4137
4138 if (cfp->type == 1) {
4139 fp = FileOpen (filename, "r");
4140 if (fp == NULL) {
4141 Message (MSG_POSTERR, "Failed to open '%s'", filename);
4142 return;
4143 }
4144
4145 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE);
4146
4147 FileClose (fp);
4148
4149 entityID = ObjMgrRegister (datatype, dataptr);
4150
4151 } else if (cfp->type >= 2 && cfp->type <= 5) {
4152 aip = AsnIoOpen (filename, cfp->binary? "rb" : "r");
4153 if (aip == NULL) {
4154 Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", filename);
4155 return;
4156 }
4157
4158 SeqMgrHoldIndexing (TRUE);
4159 switch (cfp->type) {
4160 case 2 :
4161 dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
4162 datatype = OBJ_SEQENTRY;
4163 break;
4164 case 3 :
4165 dataptr = (Pointer) BioseqAsnRead (aip, NULL);
4166 datatype = OBJ_BIOSEQ;
4167 break;
4168 case 4 :
4169 dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
4170 datatype = OBJ_BIOSEQSET;
4171 break;
4172 case 5 :
4173 dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
4174 ssp = (SeqSubmitPtr) dataptr;
4175 datatype = OBJ_SEQSUB;
4176 break;
4177 default :
4178 break;
4179 }
4180 SeqMgrHoldIndexing (FALSE);
4181
4182 AsnIoClose (aip);
4183
4184 entityID = ObjMgrRegister (datatype, dataptr);
4185
4186 } else {
4187 Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) cfp->type);
4188 return;
4189 }
4190
4191 if (entityID < 1 || dataptr == NULL) {
4192 Message (MSG_POSTERR, "Data read failed for input file '%s'", filename);
4193 return;
4194 }
4195
4196 if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
4197 datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {
4198
4199 sep = GetTopSeqEntryForEntityID (entityID);
4200
4201 if (sep == NULL) {
4202 sep = SeqEntryNew ();
4203 if (sep != NULL) {
4204 if (datatype == OBJ_BIOSEQ) {
4205 bsp = (BioseqPtr) dataptr;
4206 sep->choice = 1;
4207 sep->data.ptrvalue = bsp;
4208 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
4209 } else if (datatype == OBJ_BIOSEQSET) {
4210 bssp = (BioseqSetPtr) dataptr;
4211 sep->choice = 2;
4212 sep->data.ptrvalue = bssp;
4213 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
4214 } else {
4215 sep = SeqEntryFree (sep);
4216 }
4217 }
4218 sep = GetTopSeqEntryForEntityID (entityID);
4219 }
4220
4221 if (sep != NULL) {
4222
4223 path [0] = '\0';
4224 if (StringDoesHaveText (cfp->outfile)) {
4225
4226 StringNCpy_0 (path, cfp->outfile, sizeof (path));
4227
4228 } else if (StringDoesHaveText (cfp->results)) {
4229
4230 ptr = StringRChr (filename, DIRDELIMCHR);
4231 if (ptr != NULL) {
4232 StringNCpy_0 (path, cfp->results, sizeof (path));
4233 ptr++;
4234 FileBuildPath (path, NULL, ptr);
4235 }
4236 }
4237
4238 sep = GetTopSeqEntryForEntityID (entityID);
4239 if (sep != NULL) {
4240
4241 if (StringHasNoText (cfp->report) && StringDoesHaveText (path)) {
4242 aop = AsnIoOpen (path, "w");
4243 }
4244
4245 DoCleanup (sep, entityID, cfp, aop, NULL, ssp);
4246
4247 if (aop != NULL) {
4248 AsnIoFlush (aop);
4249 AsnIoClose (aop);
4250 }
4251 }
4252
4253 ObjMgrFreeByEntityID (entityID);
4254 }
4255
4256 } else {
4257
4258 Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
4259 }
4260 }
4261
4262 static void CleanupMultipleRecord (
4263 CharPtr filename,
4264 CleanFlagPtr cfp
4265 )
4266
4267 {
4268 AsnIoPtr aip, aop = NULL;
4269 AsnTypePtr atp;
4270 DataVal av;
4271 Char ch;
4272 Uint2 entityID;
4273 FILE *fp;
4274 size_t len;
4275 Char longest [64];
4276 Int4 numrecords;
4277 Char path [PATH_MAX];
4278 CharPtr ptr;
4279 SeqEntryPtr sep;
4280 time_t timediff, worsttime;
4281 #ifdef OS_UNIX
4282 Char cmmd [512];
4283 CharPtr gzcatprog;
4284 int ret;
4285 Boolean usedPopen = FALSE;
4286 #endif
4287
4288 if (cfp == NULL) return;
4289
4290 if (StringHasNoText (filename)) return;
4291
4292 path [0] = '\0';
4293 if (StringDoesHaveText (cfp->outfile)) {
4294
4295 StringNCpy_0 (path, cfp->outfile, sizeof (path));
4296
4297 } else if (StringDoesHaveText (cfp->results)) {
4298
4299 ptr = StringRChr (filename, DIRDELIMCHR);
4300 if (ptr != NULL) {
4301 StringNCpy_0 (path, cfp->results, sizeof (path));
4302 ptr++;
4303 if (cfp->compressed) {
4304 len = StringLen (ptr);
4305 if (len > 4 && StringCmp (ptr + len - 3, ".gz") == 0) {
4306 ptr [len - 3] = '\0';
4307 }
4308 }
4309 FileBuildPath (path, NULL, ptr);
4310 }
4311 }
4312 if (StringHasNoText (cfp->report) && StringHasNoText (path)) return;
4313
4314 #ifndef OS_UNIX
4315 if (cfp->compressed) {
4316 Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
4317 return;
4318 }
4319 #endif
4320
4321 #ifdef OS_UNIX
4322 if (cfp->compressed) {
4323 gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
4324 if (gzcatprog != NULL) {
4325 sprintf (cmmd, "%s %s", gzcatprog, filename);
4326 } else {
4327 ret = system ("gzcat -h >/dev/null 2>&1");
4328 if (ret == 0) {
4329 sprintf (cmmd, "gzcat %s", filename);
4330 } else if (ret == -1) {
4331 Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
4332 return;
4333 } else {
4334 ret = system ("zcat -h >/dev/null 2>&1");
4335 if (ret == 0) {
4336 sprintf (cmmd, "zcat %s", filename);
4337 } else if (ret == -1) {
4338 Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
4339 return;
4340 } else {
4341 Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
4342 return;
4343 }
4344 }
4345 }
4346 fp = popen (cmmd, /* cfp->binary? "rb" : */ "r");
4347 usedPopen = TRUE;
4348 } else {
4349 fp = FileOpen (filename, cfp->binary? "rb" : "r");
4350 }
4351 #else
4352 fp = FileOpen (filename, cfp->binary? "rb" : "r");
4353 #endif
4354 if (fp == NULL) {
4355 Message (MSG_POSTERR, "FileOpen failed for input file '%s'", filename);
4356 return;
4357 }
4358
4359 aip = AsnIoNew (cfp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
4360 if (aip == NULL) {
4361 Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", filename);
4362 return;
4363 }
4364
4365 if (cfp->logfp != NULL) {
4366 if (StringChr (cfp->report, 'c') == NULL) {
4367 fprintf (cfp->logfp, "%s\n\n", filename);
4368 fflush (cfp->logfp);
4369 }
4370 }
4371
4372 longest [0] = '\0';
4373 worsttime = 0;
4374 numrecords = 0;
4375
4376 if (StringHasNoText (cfp->report)) {
4377 aop = AsnIoOpen (path, cfp->binary? "wb" : "w");
4378 if (aop != NULL) {
4379 AsnOpenStruct (aop, cfp->bssp_atp, (Pointer) &(cfp->bss));
4380 av.intvalue = 7;
4381 AsnWrite (aop, cfp->atp_bsc, &av);
4382 AsnOpenStruct (aop, cfp->atp_bsss, (Pointer) &(cfp->bss.seq_set));
4383 }
4384 }
4385
4386 atp = cfp->atp_bss;
4387
4388 while ((atp = AsnReadId (aip, cfp->amp, atp)) != NULL) {
4389 if (atp == cfp->atp_se) {
4390
4391 SeqMgrHoldIndexing (TRUE);
4392 sep = SeqEntryAsnRead (aip, atp);
4393 SeqMgrHoldIndexing (FALSE);
4394
4395 if (sep != NULL) {
4396
4397 entityID = ObjMgrGetEntityIDForChoice (sep);
4398
4399 timediff = DoCleanup (sep, entityID, cfp, aop, cfp->atp_se, NULL);
4400
4401 if (timediff > worsttime) {
4402 worsttime = timediff;
4403 StringCpy (longest, cfp->buf);
4404 ptr = longest;
4405 ch = *ptr;
4406 while (ch != '\0') {
4407 if (ch == '|') {
4408 *ptr = ' ';
4409 }
4410 ptr++;
4411 ch = *ptr;
4412 }
4413 }
4414 numrecords++;
4415
4416 ObjMgrFreeByEntityID (entityID);
4417 }
4418
4419 } else {
4420
4421 AsnReadVal (aip, atp, NULL);
4422 }
4423 }
4424
4425 if (aop != NULL) {
4426 AsnCloseStruct (aop, cfp->atp_bsss, (Pointer) &(cfp->bss.seq_set));
4427 AsnCloseStruct (aop, cfp->bssp_atp, (Pointer) &(cfp->bss));
4428 AsnIoClose (aop);
4429 }
4430
4431 AsnIoFree (aip, FALSE);
4432
4433 #ifdef OS_UNIX
4434 if (usedPopen) {
4435 pclose (fp);
4436 } else {
4437 FileClose (fp);
4438 }
4439 #else
4440 FileClose (fp);
4441 #endif
4442 if (cfp->logfp != NULL) {
4443 if (StringChr (cfp->report, 'c') == NULL) {
4444 fprintf (cfp->logfp, "\nTotal number of records %ld\n", (long) numrecords);
4445 if (StringDoesHaveText (longest)) {
4446 fprintf (cfp->logfp, "Longest processing time %ld seconds on %s\n",
4447 (long) worsttime, longest);
4448 }
4449 fprintf (cfp->logfp, "Counts ");
4450 fprintf (cfp->logfp, "- %9ld RECS", (long) cfp->rawcounts.recs);
4451 fprintf (cfp->logfp, ", %9ld NUCS", (long) cfp->rawcounts.nucs);
4452 fprintf (cfp->logfp, ", %9ld PRTS", (long) cfp->rawcounts.prts);
4453 fprintf (cfp->logfp, ", %9ld OKAY", (long) cfp->rawcounts.okay);
4454 fprintf (cfp->logfp, ", %9ld NORM", (long) cfp->rawcounts.norm);
4455 fprintf (cfp->logfp, ", %9ld CLNR", (long) cfp->rawcounts.clnr);
4456 fprintf (cfp->logfp, ", %9ld OTHR", (long) cfp->rawcounts.othr);
4457 fprintf (cfp->logfp, ", %9ld MODR", (long) cfp->rawcounts.modr);
4458 fprintf (cfp->logfp, ", %9ld SLOC", (long) cfp->rawcounts.sloc);
4459 fprintf (cfp->logfp, ", %9ld PUBL", (long) cfp->rawcounts.publ);
4460 fprintf (cfp->logfp, ", %9ld AUTH", (long) cfp->rawcounts.auth);
4461 fprintf (cfp->logfp, ", %9ld SORT", (long) cfp->rawcounts.sort);
4462 fprintf (cfp->logfp, ", %9ld BSEC", (long) cfp->rawcounts.bsec);
4463 fprintf (cfp->logfp, ", %9ld GBBK", (long) cfp->rawcounts.gbbk);
4464 fprintf (cfp->logfp, ", %9ld TITL", (long) cfp->rawcounts.titl);
4465 fprintf (cfp->logfp, ", %9ld PACK", (long) cfp->rawcounts.pack);
4466 fprintf (cfp->logfp, ", %9ld MOVE", (long) cfp->rawcounts.move);
4467 fprintf (cfp->logfp, ", %9ld SSEC", (long) cfp->rawcounts.ssec);
4468 fprintf (cfp->logfp, "\n");
4469 fflush (cfp->logfp);
4470 }
4471 }
4472 }
4473
4474 static void CleanupOneRecord (
4475 CharPtr filename,
4476 Pointer userdata
4477 )
4478
4479 {
4480 CleanFlagPtr cfp;
4481 CharPtr ptr;
4482 SumDataPtr sdp;
4483
4484 if (StringHasNoText (filename)) return;
4485 cfp = (CleanFlagPtr) userdata;
4486 if (cfp == NULL) return;
4487
4488 MemSet ((Pointer) &(cfp->rawcounts), 0, sizeof (CountData));
4489 MemSet ((Pointer) &(cfp->dbsums), 0, sizeof (DbSumData));
4490
4491 if (StringChr (cfp->sourcedb, 'y') != NULL) {
4492 ptr = StringRChr (filename, DIRDELIMCHR);
4493 if (ptr != NULL) {
4494 ptr++;
4495 if (StringStr (ptr, "gbcon") != NULL ||
4496 StringStr (ptr, "gbest") != NULL ||
4497 StringStr (ptr, "gbgss") != NULL ||
4498 StringStr (ptr, "gbhtg") != NULL ||
4499 StringStr (ptr, "gbpat") != NULL ||
4500 StringStr (ptr, "gbsts") != NULL) return;
4501 }
4502 }
4503
4504 if (cfp->batch) {
4505 ptr = StringRChr (filename, DIRDELIMCHR);
4506 if (ptr != NULL) {
4507 ptr++;
4508 if (StringDoesHaveText (cfp->firstfile)) {
4509 if (StringICmp (cfp->firstfile, ptr) == 0) {
4510 cfp->foundfirst = TRUE;
4511 }
4512 if (! cfp->foundfirst) return;
4513 }
4514
4515 if (StringDoesHaveText (cfp->lastfile)) {
4516 if (cfp->foundlast) return;
4517 if (StringICmp (cfp->lastfile, ptr) == 0) {
4518 cfp->foundlast = TRUE;
4519 }
4520 }
4521 }
4522
4523 CleanupMultipleRecord (filename, cfp);
4524 } else {
4525 CleanupSingleRecord (filename, cfp);
4526 }
4527
4528 if (cfp->logfp != NULL) {
4529 if (StringChr (cfp->report, 'c') != NULL) {
4530 ptr = StringRChr (filename, DIRDELIMCHR);
4531 if (ptr != NULL) {
4532 ptr++;
4533 fprintf (cfp->logfp, "%s", ptr);
4534 }
4535 sdp = &(cfp->dbsums.genbank);
4536 if (sdp != NULL) {
4537 fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4538 }
4539 sdp = &(cfp->dbsums.embl);
4540 if (sdp != NULL) {
4541 fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4542 }
4543 sdp = &(cfp->dbsums.ddbj);
4544 if (sdp != NULL) {
4545 fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4546 }
4547 sdp = &(cfp->dbsums.refseq);
4548 if (sdp != NULL) {
4549 fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4550 }
4551 sdp = &(cfp->dbsums.other);
4552 if (sdp != NULL) {
4553 fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4554 }
4555 fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) cfp->rawcounts.recs, (long) cfp->rawcounts.nucs, (long) cfp->rawcounts.prts);
4556 fprintf (cfp->logfp, "\n");
4557 fflush (cfp->logfp);
4558 }
4559 }
4560 }
4561
4562 /* Args structure contains command-line arguments */
4563
/*
 * Index of each command-line option within myargs.  The numeric values
 * must stay in step with the order of the entries in that table; the
 * letter prefix of each name is the option's command-line flag.
 */
typedef enum {
  p_argInputPath   =  0,   /* -p */
  r_argOutputPath  =  1,   /* -r */
  i_argInputFile   =  2,   /* -i */
  o_argOutputFile  =  3,   /* -o */
  f_argFilter      =  4,   /* -f */
  x_argSuffix      =  5,   /* -x */
  j_argFirstFile   =  6,   /* -j */
  k_argLastFile    =  7,   /* -k */
  d_argSourceDb    =  8,   /* -d */
  a_argType        =  9,   /* -a */
  b_argBinary      = 10,   /* -b */
  c_argCompressed  = 11,   /* -c */
  L_argLogFile     = 12,   /* -L */
  R_argRemote      = 13,   /* -R */
  Q_argReport      = 14,   /* -Q */
  S_argSelective   = 15,   /* -S */
  m_argFfMode      = 16,   /* -m */
  q_argFfDiff      = 17,   /* -q */
  n_argAsn2Flat    = 18,   /* -n */
  v_argAsnVal      = 19,   /* -v */
  K_argClean       = 20,   /* -K */
  U_argModernize   = 21,   /* -U */
  N_argLink        = 22,   /* -N */
  F_argFeat        = 23,   /* -F */
  D_argDesc        = 24,   /* -D */
  X_argMods        = 25,   /* -X */
  M_argMacro       = 26,   /* -M */
  T_argTaxonLookup = 27,   /* -T */
  P_argPubLookup   = 28    /* -P */
} Arguments;
4595
4596 Args myargs [] = {
4597 {"Path to Files", NULL, NULL, NULL,
4598 TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
4599 {"Path for Results", NULL, NULL, NULL,
4600 TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
4601 {"Single Input File", "stdin", NULL, NULL,
4602 TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
4603 {"Single Output File", "stdout", NULL, NULL,
4604 TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
4605 {"Substring Filter", NULL, NULL, NULL,
4606 TRUE, 'f', ARG_STRING, 0.0, 0, NULL},
4607 {"File Selection Suffix", ".ent", NULL, NULL,
4608 TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
4609 {"First File Name", NULL, NULL, NULL,
4610 TRUE, 'j', ARG_STRING, 0.0, 0, NULL},
4611 {"Last File Name", NULL, NULL, NULL,
4612 TRUE, 'k', ARG_STRING, 0.0, 0, NULL},
4613 {"Source Database\n"
4614 " a Any\n"
4615 " g GenBank\n"
4616 " e EMBL\n"
4617 " d DDBJ\n"
4618 " r RefSeq\n"
4619 " n NCBI\n"
4620 " x Exclude EMBL/DDBJ\n"
4621 " y Exclude gbcon, gbest, gbgss, gbhtg, gbpat, gbsts\n", "a", NULL, NULL,
4622 TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
4623 {"ASN.1 Type\n"
4624 " a Any\n"
4625 " e Seq-entry\n"
4626 " b Bioseq\n"
4627 " s Bioseq-set\n"
4628 " m Seq-submit\n"
4629 " t Batch Processing\n", "a", NULL, NULL,
4630 TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
4631 {"Bioseq-set is Binary", "F", NULL, NULL,
4632 TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
4633 {"Bioseq-set is Compressed", "F", NULL, NULL,
4634 TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
4635 {"Log File", NULL, NULL, NULL,
4636 TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL},
4637 {"Remote Fetching from ID", "F", NULL, NULL,
4638 TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
4639 {"Report\n"
4640 " c Record Count\n"
4641 " r ASN.1 BSEC Report\n"
4642 " s ASN.1 SSEC Report\n"
4643 " n NORM vs. SSEC Report\n"
4644 " d Log SSEC Differences\n"
4645 " g GenBank SSEC Diff\n"
4646 " f asn2gb/asn2flat Diff\n"
4647 " v Validator SSEC Diff\n"
4648 " m Modernize Gene/RNA/PCR\n"
4649 " u Unpublished Pub Lookup\n"
4650 " p Published Pub Lookup\n", NULL, NULL, NULL,
4651 TRUE, 'Q', ARG_STRING, 0.0, 0, NULL},
4652 {"Selective Difference Filter\n"
4653 " a Author\n"
4654 " p Publication\n"
4655 " l Location\n"
4656 " r RNA\n"
4657 " s Qualifier Sort Order\n"
4658 " g Genbank Block\n"
4659 " k Package CdRegion or Parts Features\n"
4660 " m Move Publication\n"
4661 " (Capital Letters Skip)\n", NULL, NULL, NULL,
4662 TRUE, 'S', ARG_STRING, 0.0, 0, NULL},
4663 {"Flatfile Mode\n"
4664 " r Release\n"
4665 " e Entrez\n"
4666 " s Sequin\n"
4667 " d Dump\n", NULL, NULL, NULL,
4668 TRUE, 'm', ARG_STRING, 0.0, 0, NULL},
4669 {"ffdiff Executable", "/netopt/genbank/subtool/bin/ffdiff", NULL, NULL,
4670 TRUE, 'q', ARG_FILE_IN, 0.0, 0, NULL},
4671 {"asn2flat Executable", "/netopt/ncbi_tools/bin/asn2flat", NULL, NULL,
4672 TRUE, 'n', ARG_FILE_IN, 0.0, 0, NULL},
4673 {"asnval Executable", "/netopt/ncbi_tools/bin/asnval", NULL, NULL,
4674 TRUE, 'v', ARG_FILE_IN, 0.0, 0, NULL},
4675 {"Cleanup\n"
4676 " b BasicSeqEntryCleanup\n"
4677 " p C++ BasicCleanup\n"
4678 " s SeriousSeqEntryCleanup\n"
4679 " g GpipeSeqEntryCleanup\n"
4680 " n Normalize Descriptor Order\n"
4681 " u Remove NcbiCleanup User Objects\n", NULL, NULL, NULL,
4682 TRUE, 'K', ARG_STRING, 0.0, 0, NULL},
4683 {"Modernize\n"
4684 " g Gene\n"
4685 " r RNA\n"
4686 " p PCR Primers\n", NULL, NULL, NULL,
4687 TRUE, 'U', ARG_STRING, 0.0, 0, NULL},
4688 {"Link\n"
4689 " o LinkCDSmRNAbyOverlap\n"
4690 " p LinkCDSmRNAbyProduct\n"
4691 " r ReassignFeatureIDs\n"
4692 " c ClearFeatureIDs\n", NULL, NULL, NULL,
4693 TRUE, 'N', ARG_STRING, 0.0, 0, NULL},
4694 {"Feature\n"
4695 " u Remove User Object\n"
4696 " d Remove db_xref\n"
4697 " r Remove Redundant Gene xref\n", NULL, NULL, NULL,
4698 TRUE, 'F', ARG_STRING, 0.0, 0, NULL},
4699 {"Descriptor\n"
4700 " t Remove Title\n", NULL, NULL, NULL,
4701 TRUE, 'D', ARG_STRING, 0.0, 0, NULL},
4702 {"Miscellaneous\n"
4703 " d Automatic Definition Line\n", NULL, NULL, NULL,
4704 TRUE, 'X', ARG_STRING, 0.0, 0, NULL},
4705 {"Macro File", NULL, NULL, NULL,
4706 TRUE, 'M', ARG_FILE_IN, 0.0, 0, NULL},
4707 {"Taxonomy Lookup", "F", NULL, NULL,
4708 TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
4709 {"Publication Lookup", "F", NULL, NULL,
4710 TRUE, 'P', ARG_BOOLEAN, 0.0, 0, NULL},
4711 };
4712
4713 Int2 Main (void)
4714
4715 {
4716 ValNodePtr action_list;
4717 AsnIoPtr aip;
4718 Char app [64], mode, type;
4719 CleanFlagData cfd;
4720 CharPtr directory, filter, infile, logfile, outfile,
4721 macro_file, results, str, suffix;
4722 Boolean remote;
4723 time_t runtime, starttime, stoptime;
4724
4725 /* standard setup */
4726
4727 ErrSetFatalLevel (SEV_MAX);
4728 ErrClearOptFlags (EO_SHOW_USERSTR);
4729 UseLocalAsnloadDataAndErrMsg ();
4730 ErrPathReset ();
4731
4732 /* finish resolving internal connections in ASN.1 parse tables */
4733
4734 if (! AllObjLoad ()) {
4735 Message (MSG_FATAL, "AllObjLoad failed");
4736 return 1;
4737 }
4738 if (! SubmitAsnLoad ()) {
4739 Message (MSG_FATAL, "SubmitAsnLoad failed");
4740 return 1;
4741 }
4742 if (! FeatDefSetLoad ()) {
4743 Message (MSG_FATAL, "FeatDefSetLoad failed");
4744 return 1;
4745 }
4746 if (! SeqCodeSetLoad ()) {
4747 Message (MSG_FATAL, "SeqCodeSetLoad failed");
4748 return 1;
4749 }
4750 if (! GeneticCodeTableLoad ()) {
4751 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
4752 return 1;
4753 }
4754
4755 /* process command line arguments */
4756
4757 sprintf (app, "cleanasn %s", CLEANASN_APPLICATION);
4758 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
4759 return 0;
4760 }
4761
4762 MemSet ((Pointer) &cfd, 0, sizeof (CleanFlagData));
4763
4764 directory = (CharPtr) myargs [p_argInputPath].strvalue;
4765 results = (CharPtr) myargs [r_argOutputPath].strvalue;
4766 if (StringHasNoText (results)) {
4767 results = directory;
4768 }
4769 infile = (CharPtr) myargs [i_argInputFile].strvalue;
4770 outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
4771 filter = (CharPtr) myargs [f_argFilter].strvalue;
4772 suffix = (CharPtr) myargs [x_argSuffix].strvalue;
4773
4774 cfd.batch = FALSE;
4775 cfd.binary = (Boolean) myargs [b_argBinary].intvalue;
4776 cfd.compressed = (Boolean) myargs [c_argCompressed].intvalue;
4777 cfd.type = 1;
4778
4779 cfd.foundfirst = FALSE;
4780 cfd.foundlast = FALSE;
4781 cfd.sourcedb = myargs [d_argSourceDb].strvalue;
4782
4783 str = myargs [a_argType].strvalue;
4784 TrimSpacesAroundString (str);
4785 if (StringDoesHaveText (str)) {
4786 type = str [0];
4787 } else {
4788 type = 'a';
4789 }
4790
4791 type = TO_LOWER (type);
4792 switch (type) {
4793 case 'a' :
4794 cfd.type = 1;
4795 break;
4796 case 'e' :
4797 cfd.type = 2;
4798 break;
4799 case 'b' :
4800 cfd.type = 3;
4801 break;
4802 case 's' :
4803 cfd.type = 4;
4804 break;
4805 case 'm' :
4806 cfd.type = 5;
4807 break;
4808 case 't' :
4809 cfd.type = 1;
4810 cfd.batch = TRUE;
4811 break;
4812 default :
4813 cfd.type = 1;
4814 break;
4815 }
4816
4817 remote = (Boolean) myargs [R_argRemote].intvalue;
4818
4819 cfd.report = myargs [Q_argReport].strvalue;
4820 cfd.selective = myargs [S_argSelective].strvalue;
4821 cfd.ffdiff = myargs [q_argFfDiff].strvalue;
4822 cfd.asn2flat = myargs [n_argAsn2Flat].strvalue;
4823 cfd.asnval = myargs [v_argAsnVal].strvalue;
4824
4825 str = myargs [m_argFfMode].strvalue;
4826 TrimSpacesAroundString (str);
4827 if (StringDoesHaveText (str)) {
4828 mode = str [0];
4829 } else {
4830 mode = 'e';
4831 }
4832
4833 mode = TO_LOWER (mode);
4834 switch (mode) {
4835 case 'r' :
4836 cfd.ffmode = RELEASE_MODE;
4837 break;
4838 case 'e' :
4839 cfd.ffmode = ENTREZ_MODE;
4840 break;
4841 case 's' :
4842 cfd.ffmode = SEQUIN_MODE;
4843 break;
4844 case 'd' :
4845 cfd.ffmode = DUMP_MODE;
4846 break;
4847 default :
4848 cfd.ffmode = ENTREZ_MODE;
4849 break;
4850 }
4851
4852 cfd.clean = myargs [K_argClean].strvalue;
4853 cfd.modernize = myargs [U_argModernize].strvalue;
4854 cfd.link = myargs [N_argLink].strvalue;
4855 cfd.feat = myargs [F_argFeat].strvalue;
4856 cfd.desc = myargs [D_argDesc].strvalue;
4857 cfd.mods = myargs [X_argMods].strvalue;
4858 cfd.taxon = (Boolean) myargs [T_argTaxonLookup].intvalue;
4859 cfd.pub = (Boolean) myargs [P_argPubLookup].intvalue;
4860
4861 macro_file = myargs [M_argMacro].strvalue;
4862 if (StringDoesHaveText (macro_file)) {
4863 aip = AsnIoOpen (macro_file, "r");
4864 if (aip == NULL) {
4865 Message (MSG_FATAL, "Unable to open macro file '%s'", macro_file);
4866 return 1;
4867 }
4868 action_list = MacroActionListAsnRead (aip, NULL);
4869 AsnIoClose (aip);
4870 if (action_list == NULL) {
4871 Message (MSG_FATAL, "Unable to read macro file '%s'", macro_file);
4872 }
4873 cfd.action_list = action_list;
4874 }
4875
4876 cfd.amp = AsnAllModPtr ();
4877 cfd.atp_bss = AsnFind ("Bioseq-set");
4878 cfd.atp_bsss = AsnFind ("Bioseq-set.seq-set");
4879 cfd.atp_se = AsnFind ("Bioseq-set.seq-set.E");
4880 cfd.atp_bsc = AsnFind ("Bioseq-set.class");
4881 cfd.bssp_atp = AsnLinkType (NULL, cfd.atp_bss);
4882
4883 logfile = (CharPtr) myargs [L_argLogFile].strvalue;
4884 if (StringDoesHaveText (logfile)) {
4885 cfd.logfp = FileOpen (logfile, "w");
4886 }
4887
4888 if (remote) {
4889 #ifdef INTERNAL_NCBI_CLEANASN
4890 if (! PUBSEQBioseqFetchEnable ("cleanasn", FALSE)) {
4891 Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
4892 return 1;
4893 }
4894 #else
4895 PubSeqFetchEnable ();
4896 #endif
4897 }
4898
4899 if (remote || cfd.pub) {
4900 PubMedFetchEnable ();
4901 }
4902
4903 /*
4904 if (cfd.logfp != NULL && StringChr (cfd.report, 'c') != NULL) {
4905 fprintf (cfd.logfp, "FILE\t\tGENBANK\t\t\tEMBL\t\t\tDDBJ\t\t\tREFSEQ\t\t\tOTHER\n");
4906 fprintf (cfd.logfp, "\tREC\tNUC\tPRT\tREC\tNUC\tPRT\tREC\tNUC\tPRT\tREC\tNUC\tPRT\tREC\tNUC\tPRT\n");
4907 fflush (cfd.logfp);
4908 }
4909 */
4910
4911 starttime = GetSecs ();
4912
4913 if (StringDoesHaveText (directory)) {
4914 if (StringHasNoText (cfd.report) && StringCmp (directory, results) == 0) {
4915 Message (MSG_POSTERR, "-r results path must be different than -p data path");
4916 if (cfd.logfp != NULL) {
4917 fprintf (cfd.logfp, "-r results path must be different than -p data path\n");
4918 }
4919 } else {
4920
4921 cfd.firstfile = (CharPtr) myargs [j_argFirstFile].strvalue;
4922 cfd.lastfile = (CharPtr) myargs [k_argLastFile].strvalue;
4923
4924 cfd.results = results;
4925
4926 DirExplore (directory, filter, suffix, FALSE, CleanupOneRecord, (Pointer) &cfd);
4927 }
4928
4929 } else if (StringDoesHaveText (infile) && StringDoesHaveText (outfile)) {
4930
4931 cfd.outfile = outfile;
4932
4933 CleanupOneRecord (infile, (Pointer) &cfd);
4934 }
4935
4936 stoptime = GetSecs ();
4937 runtime = stoptime - starttime;
4938
4939 if (cfd.logfp != NULL) {
4940 if (StringChr (cfd.report, 'c') == NULL) {
4941 fprintf (cfd.logfp, "\nFinished in %ld seconds\n", (long) runtime);
4942 fprintf (cfd.logfp, "Cumulative counts ");
4943 fprintf (cfd.logfp, "- %9ld RECS", (long) cfd.cumcounts.recs);
4944 fprintf (cfd.logfp, ", %9ld NUCS", (long) cfd.cumcounts.nucs);
4945 fprintf (cfd.logfp, ", %9ld PRTS", (long) cfd.cumcounts.prts);
4946 fprintf (cfd.logfp, ", %9ld OKAY", (long) cfd.cumcounts.okay);
4947 fprintf (cfd.logfp, ", %9ld NORM", (long) cfd.cumcounts.norm);
4948 fprintf (cfd.logfp, ", %9ld CLNR", (long) cfd.cumcounts.clnr);
4949 fprintf (cfd.logfp, ", %9ld OTHR", (long) cfd.cumcounts.othr);
4950 fprintf (cfd.logfp, ", %9ld MODR", (long) cfd.cumcounts.modr);
4951 fprintf (cfd.logfp, ", %9ld SLOC", (long) cfd.cumcounts.sloc);
4952 fprintf (cfd.logfp, ", %9ld PUBL", (long) cfd.cumcounts.publ);
4953 fprintf (cfd.logfp, ", %9ld AUTH", (long) cfd.cumcounts.auth);
4954 fprintf (cfd.logfp, ", %9ld SORT", (long) cfd.cumcounts.sort);
4955 fprintf (cfd.logfp, ", %9ld BSEC", (long) cfd.cumcounts.bsec);
4956 fprintf (cfd.logfp, ", %9ld GBBK", (long) cfd.cumcounts.gbbk);
4957 fprintf (cfd.logfp, ", %9ld TITL", (long) cfd.cumcounts.titl);
4958 fprintf (cfd.logfp, ", %9ld PACK", (long) cfd.cumcounts.pack);
4959 fprintf (cfd.logfp, ", %9ld MOVE", (long) cfd.cumcounts.move);
4960 fprintf (cfd.logfp, ", %9ld SSEC", (long) cfd.cumcounts.ssec);
4961 fprintf (cfd.logfp, "\n");
4962 fflush (cfd.logfp);
4963 }
4964 FileClose (cfd.logfp);
4965 }
4966
4967 if (remote || cfd.pub) {
4968 PubMedFetchDisable ();
4969 }
4970
4971 if (remote) {
4972 #ifdef INTERNAL_NCBI_CLEANASN
4973 PUBSEQBioseqFetchDisable ();
4974 #else
4975 PubSeqFetchDisable ();
4976 #endif
4977 }
4978
4979 return 0;
4980 }
4981
4982 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |