NCBI C Toolkit Cross Reference

C/demo/cleanasn.c


  1 /*   cleanasn.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  cleanasn.c
 27 *
 28 * Author:  Jonathan Kans
 29 *
 30 * Version Creation Date:   10/19/99
 31 *
 32 * $Revision: 6.106 $
 33 *
 34 * File Description: 
 35 *
 36 * Modifications:  
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45 #include <ncbi.h>
 46 #include <objall.h>
 47 #include <objsset.h>
 48 #include <objfdef.h>
 49 #include <objsub.h>
 50 #include <sequtil.h>
 51 #include <gather.h>
 52 #include <sqnutils.h>
 53 #include <explore.h>
 54 #include <tofasta.h>
 55 #include <toasn3.h>
 56 #include <toporg.h>
 57 #include <subutil.h>
 58 #include <asn2gnbk.h>
 59 #include <pmfapi.h>
 60 #include <tax3api.h>
 61 #include <asn2gnbi.h>
 62 #include <ent2api.h>
 63 #ifdef INTERNAL_NCBI_CLEANASN
 64 #include <accpubseq.h>
 65 #endif
 66 #define NLM_GENERATED_CODE_PROTO
 67 #include <objmacro.h>
 68 #include <macroapi.h>
 69 
 70 #define CLEANASN_APP_VER "4.2"
 71 
 72 CharPtr CLEANASN_APPLICATION = CLEANASN_APP_VER;
 73 
 74 typedef struct sums {
 75   Int4          nucs;
 76   Int4          prts;
 77   Int4          recs;
 78 } SumData, PNTR SumDataPtr;
 79 
 80 typedef struct dbsums {
 81   SumData      genbank;
 82   SumData      embl;
 83   SumData      ddbj;
 84   SumData      refseq;
 85   SumData      other;
 86 } DbSumData, PNTR DbSumPtr;
 87 
 88 typedef struct counts {
 89   Int4          auth;
 90   Int4          bsec;
 91   Int4          clnr;
 92   Int4          gbbk;
 93   Int4          modr;
 94   Int4          move;
 95   Int4          norm;
 96   Int4          nucs;
 97   Int4          okay;
 98   Int4          othr;
 99   Int4          pack;
100   Int4          prts;
101   Int4          publ;
102   Int4          recs;
103   Int4          sloc;
104   Int4          sort;
105   Int4          ssec;
106   Int4          titl;
107 } CountData, PNTR CountDataPtr;
108 
109 typedef struct cleanflags {
110   Char          buf [64];
111   Int4          gi;
112   Int2          year;
113   Boolean       stripSerial;
114   Boolean       isRefSeq;
115   Boolean       batch;
116   Boolean       binary;
117   Boolean       compressed;
118   Int2          type;
119   CharPtr       results;
120   CharPtr       outfile;
121   CharPtr       firstfile;
122   CharPtr       lastfile;
123   Boolean       foundfirst;
124   Boolean       foundlast;
125   CharPtr       sourcedb;
126   CharPtr       report;
127   CharPtr       selective;
128   ModType       ffmode;
129   CharPtr       ffdiff;
130   CharPtr       asn2flat;
131   CharPtr       asnval;
132   CharPtr       clean;
133   CharPtr       modernize;
134   CharPtr       link;
135   CharPtr       feat;
136   CharPtr       desc;
137   CharPtr       mods;
138   ValNodePtr    action_list;
139   Boolean       taxon;
140   Boolean       pub;
141   Int4          unpubcount;
142   CountData     rawcounts;
143   CountData     cumcounts;
144   DbSumData     dbsums;
145   AsnModulePtr  amp;
146   AsnTypePtr    atp_bss;
147   AsnTypePtr    atp_bsss;
148   AsnTypePtr    atp_se;
149   AsnTypePtr    atp_bsc;
150   AsnTypePtr    bssp_atp;
151   BioseqSet     bss;
152   FILE          *logfp;
153 } CleanFlagData, PNTR CleanFlagPtr;
154 
155 static void RemoveFeatUser (
156   SeqFeatPtr sfp,
157   Pointer userdata
158 )
159 
160 {
161   if (sfp == NULL) return;
162   if (sfp->ext != NULL) {
163     sfp->ext = UserObjectFree (sfp->ext);
164   }
165 }
166 
167 static void RemoveFeatDbxref (
168   SeqFeatPtr sfp,
169   Pointer userdata
170 )
171 
172 {
173   DbtagPtr    dbt;
174   ValNodePtr  next, vnp;
175 
176   if (sfp == NULL) return;
177   for (vnp = sfp->dbxref; vnp != NULL; vnp = next) {
178     next = vnp->next;
179     dbt = (DbtagPtr) vnp->data.ptrvalue;
180     DbtagFree (dbt);
181     MemFree (vnp);
182   }
183   sfp->dbxref = NULL;
184 }
185 
186 typedef struct dummysmfedata {
187   Int4  max;
188   Int4  num_at_max;
189 } DummySmfeData, PNTR DummySmfePtr;
190 
191 static Boolean LIBCALLBACK CADummySMFEProc (
192   SeqFeatPtr sfp,
193   SeqMgrFeatContextPtr context
194 )
195 
196 
197 {
198   DummySmfePtr  dsp;
199   Int4          len;
200 
201   if (sfp == NULL || context == NULL) return TRUE;
202   dsp = context->userdata;
203   if (dsp == NULL) return TRUE;
204 
205   len = SeqLocLen (sfp->location);
206   if (len < dsp->max) {
207     dsp->max = len;
208     dsp->num_at_max = 1;
209   } else if (len == dsp->max) {
210     (dsp->num_at_max)++;
211   }
212 
213   return TRUE;
214 }
215 
216 static void RemoveUnnecGeneXref (
217   SeqFeatPtr sfp,
218   Pointer userdata
219 )
220 
221 {
222   Int2                 count;
223   SeqFeatXrefPtr       curr, next;
224   DummySmfeData        dsd;
225   SeqMgrFeatContext    fcontext;
226   SeqFeatXrefPtr PNTR  last;
227   GeneRefPtr           grp, grpx;
228   SeqFeatPtr           sfpx;
229   CharPtr              syn1, syn2;
230 
231   if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return;
232   grp = SeqMgrGetGeneXref (sfp);
233   if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return;
234   sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
235   if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return;
236   grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
237   if (grpx == NULL) return;
238 
239   if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) {
240     if (StringICmp (grp->locus_tag, grpx->locus_tag) != 0) return;
241   } else if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) {
242     if (StringICmp (grp->locus, grpx->locus) != 0) return;
243   } else if (grp->syn != NULL && grpx->syn != NULL) {
244     syn1 = (CharPtr) grp->syn->data.ptrvalue;
245     syn2 = (CharPtr) grpx->syn->data.ptrvalue;
246     if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) {
247       if (StringICmp (syn1, syn2) != 0) return;
248     }
249   }
250 
251   MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
252   dsd.max = INT4_MAX;
253   dsd.num_at_max = 0;
254   count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE,
255                                            NULL, 0, LOCATION_SUBSET,
256                                            (Pointer) &dsd, CADummySMFEProc);
257 
258   if (dsd.num_at_max < 2) {
259     last = (SeqFeatXrefPtr PNTR) &(sfp->xref);
260     curr = sfp->xref;
261     while (curr != NULL) {
262       next = curr->next;
263       if (curr->data.choice == SEQFEAT_GENE) {
264         *last = next;
265         curr->next = NULL;
266         SeqFeatXrefFree (curr);
267       } else {
268         last = &(curr->next);
269       }
270       curr = next;
271     }
272   }
273 }
274 
275 static void MarkTitles (
276   SeqDescrPtr sdp,
277   Pointer userdata
278 )
279 
280 {
281   ObjValNodePtr  ovn;
282 
283   if (sdp == NULL || sdp->choice != Seq_descr_title) return;
284   if (sdp->extended == 0) return;
285   ovn = (ObjValNodePtr) sdp;
286   ovn->idx.deleteme = TRUE;
287 }
288 
289 static void DoAutoDef (
290   SeqEntryPtr sep,
291   Uint2 entityID
292 )
293 
294 {
295   ValNodePtr                    defline_clauses = NULL;
296   DeflineFeatureRequestList     feature_requests;
297   Int4                          index;
298   ValNodePtr                    modifier_indices = NULL;
299   ModifierItemLocalPtr          modList;
300   OrganismDescriptionModifiers  odmp;
301   SeqEntryPtr                   oldscope;
302 
303   if (sep == NULL) return;
304   if (entityID < 1) return;
305 
306   modList = MemNew (NumDefLineModifiers () * sizeof (ModifierItemLocalData));
307   if (modList == NULL) return;
308 
309   InitFeatureRequests (&feature_requests);
310 
311   SetRequiredModifiers (modList);
312   CountModifiers (modList, sep);
313 
314   InitOrganismDescriptionModifiers (&odmp, sep);
315 
316   RemoveNucProtSetTitles (sep);  
317   oldscope = SeqEntrySetScope (sep);
318 
319   BuildDefLineFeatClauseList (sep, entityID, &feature_requests,
320                               DEFAULT_ORGANELLE_CLAUSE, FALSE, FALSE,
321                               &defline_clauses);
322   if (AreFeatureClausesUnique (defline_clauses)) {
323     modifier_indices = GetModifierIndicesFromModList (modList);
324   } else {
325     modifier_indices = FindBestModifiers (sep, modList);
326   }
327 
328   BuildDefinitionLinesFromFeatureClauseLists (defline_clauses, modList,
329                                               modifier_indices, &odmp);
330   DefLineFeatClauseListFree (defline_clauses);
331   if (modList != NULL) {
332     for (index = 0; index < NumDefLineModifiers (); index++) {
333       ValNodeFree (modList [index].values_seen);
334     }
335     MemFree (modList);
336   }
337   modifier_indices = ValNodeFree (modifier_indices);
338 
339   ClearProteinTitlesInNucProts (entityID, NULL);
340   InstantiateProteinTitles (entityID, NULL);
341 
342   SeqEntrySetScope (oldscope);
343 }
344 
345 static void LookupPubdesc (
346   PubdescPtr pdp,
347   Pointer userdata
348 )
349 
350 {
351   CitArtPtr        cap;
352   MedlineEntryPtr  mep;
353   PubmedEntryPtr   pep;
354   Int4             pmid = 0;
355   ValNodePtr       vnp;
356 
357   if (pdp == NULL) return;
358 
359   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
360     switch (vnp->choice) {
361       case PUB_Muid :
362         /* ignore obsolete muids */
363         break;
364       case PUB_PMid :
365         pmid = vnp->data.intvalue;
366         break;
367       default :
368         /* return on real pub */
369         return;
370         break;
371     }
372   }
373 
374   if (pmid == 0) return;
375 
376   pep = GetPubMedForUid (pmid);
377   if (pep == NULL) return;
378   mep = (MedlineEntryPtr) pep->medent;
379   if (mep != NULL && mep->cit != NULL) {
380     cap = AsnIoMemCopy ((Pointer) mep->cit,
381                         (AsnReadFunc) CitArtAsnRead,
382                         (AsnWriteFunc) CitArtAsnWrite);
383     ValNodeAddPointer (&(pdp->pub), PUB_Article, (Pointer) cap);
384   }
385 
386   PubmedEntryFree (pep);
387 }
388 
389 static void CleanupLocation (
390   SeqFeatPtr sfp,
391   Pointer userdata
392 )
393 
394 {
395   BioseqPtr  bsp;
396   SeqIntPtr  sintp;
397   SeqLocPtr  slp;
398 
399   if (sfp == NULL || sfp->location == NULL) return;
400 
401   CleanUpSeqLoc (sfp->location);
402 
403   if (sfp->data.choice == SEQFEAT_REGION ||
404       sfp->data.choice == SEQFEAT_SITE ||
405       sfp->data.choice == SEQFEAT_BOND ||
406       sfp->data.choice == SEQFEAT_PROT) {
407     bsp = BioseqFind (SeqLocId (sfp->location));
408     if (bsp != NULL && ISA_aa (bsp->mol)) {
409       slp = SeqLocFindNext (sfp->location, NULL);
410       while (slp != NULL) {
411         if (slp->choice == SEQLOC_INT) {
412           sintp = (SeqIntPtr) slp->data.ptrvalue;
413           if (sintp != NULL) {
414             if (sintp->strand != Seq_strand_unknown) {
415               sintp->strand = Seq_strand_unknown;
416             }
417           }
418         }
419         slp = SeqLocFindNext (sfp->location, slp);
420       }
421     }
422   }
423 }
424 
425 static void CleanupMostRNAs (
426   SeqFeatPtr sfp,
427   Pointer userdata
428 )
429 
430 {
431   RnaRefPtr  rrp;
432 
433   if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
434   rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
435   if (rrp == NULL || rrp->type == 255) return;
436 
437   CleanUpSeqFeat (sfp, FALSE, FALSE, TRUE, FALSE, NULL);
438 }
439 
440 static void CleanupRemainingRNAs (
441   SeqFeatPtr sfp,
442   Pointer userdata
443 )
444 
445 {
446   if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
447 
448   CleanUpSeqFeat (sfp, FALSE, FALSE, TRUE, FALSE, NULL);
449 }
450 
451 static void CleanupPubAuthors (
452   PubdescPtr pdp,
453   Pointer userdata
454 )
455 
456 {
457   if (pdp == NULL) return;
458 
459   CleanUpPubdescAuthors (pdp);
460 }
461 
462 static void CleanupPubBody (
463   PubdescPtr pdp,
464   Pointer userdata
465 )
466 
467 {
468   CleanFlagPtr  cfp;
469 
470   if (pdp == NULL) return;
471   cfp = (CleanFlagPtr) userdata;
472   if (cfp == NULL) return;
473 
474   CleanUpPubdescBody (pdp, cfp->stripSerial);
475 }
476 
477 static void ModGenes (
478   SeqFeatPtr sfp,
479   Pointer userdata
480 )
481 
482 {
483   ModernizeGeneFields (sfp);
484 }
485 
486 static void ModRNAs (
487   SeqFeatPtr sfp,
488   Pointer userdata
489 )
490 
491 {
492   ModernizeRNAFields (sfp);
493 }
494 
495 static void ModPCRs (
496   BioSourcePtr biop,
497   Pointer userdata
498 )
499 
500 {
501   ModernizePCRPrimers (biop);
502 }
503 
504 static ByteStorePtr Se2Bs (
505   SeqEntryPtr sep
506 )
507 
508 {
509   AsnIoBSPtr    aibp;
510   ByteStorePtr  bs;
511 
512   if (sep == NULL) return NULL;
513 
514   bs = BSNew (1000);
515   if (bs == NULL) return NULL;
516   aibp = AsnIoBSOpen ("w", bs);
517   if (aibp == NULL || aibp->aip == NULL) return NULL;
518 
519   SeqEntryAsnWrite (sep, aibp->aip, NULL);
520 
521   AsnIoFlush (aibp->aip);
522   AsnIoBSClose (aibp);
523 
524   return bs;
525 }
526 
527 static ByteStorePtr Se2BsX (
528   SeqEntryPtr sep
529 )
530 
531 {
532   AsnIoBSPtr    aibp;
533   ByteStorePtr  bs;
534 
535   if (sep == NULL) return NULL;
536 
537   bs = BSNew (1000);
538   if (bs == NULL) return NULL;
539   aibp = AsnIoBSOpen ("w", bs);
540   if (aibp == NULL || aibp->aip == NULL) return NULL;
541 
542   aibp->aip->asn_no_newline = TRUE;
543   aibp->aip->asn_alt_struct = TRUE;
544 
545   SeqEntryAsnWrite (sep, aibp->aip, NULL);
546 
547   AsnIoFlush (aibp->aip);
548   AsnIoBSClose (aibp);
549 
550   return bs;
551 }
552 
553 /*
554 static CharPtr Se2Str (
555   SeqEntryPtr sep
556 )
557 
558 {
559   AsnIoBSPtr    aibp;
560   ByteStorePtr  bs;
561   CharPtr       str;
562 
563   if (sep == NULL) return NULL;
564 
565   bs = BSNew (1000);
566   if (bs == NULL) return NULL;
567   aibp = AsnIoBSOpen ("w", bs);
568   if (aibp == NULL) return NULL;
569 
570   SeqEntryAsnWrite (sep, aibp->aip, NULL);
571 
572   AsnIoFlush (aibp->aip);
573   AsnIoBSClose (aibp);
574 
575   str = BSMerge (bs, NULL);
576   BSFree (bs);
577 
578   return str;
579 }
580 */
581 
582 typedef struct chgdata {
583   Boolean       isRefSeq;
584   Boolean       sgml;
585   Boolean       cdscodon;
586   Boolean       rubisco;
587   Boolean       rbc;
588   Boolean       its;
589   Boolean       rnaother;
590   Boolean       trnanote;
591   Boolean       oldbiomol;
592   Boolean       oldgbqual;
593   Boolean       badDbxref;
594   Boolean       refDbxref;
595   Boolean       srcDbxref;
596   Boolean       capDbxref;
597   Boolean       oldDbxref;
598   Boolean       privDbxref;
599   Boolean       multDbxref;
600   Boolean       rareDbxref;
601   Boolean       badOrg;
602   Boolean       rpt_unit_seq;
603   Boolean       hasUnpublished;
604   Boolean       hasPublished;
605   Int4          protdesc;
606   Int4          sfpnote;
607   Int4          gbsource;
608   Int4          cdsconf;
609 } ChangeData, PNTR ChangeDataPtr;
610 
611 static Boolean IsRubisco (
612   CharPtr name
613 )
614 
615 {
616   return (StringICmp (name, "rubisco large subunit") == 0 ||
617           StringICmp (name, "rubisco small subunit") == 0);
618 }
619 
620 static Boolean IsRbc (
621   CharPtr name
622 )
623 
624 {
625   return (StringICmp (name, "RbcL") == 0 ||
626           StringICmp (name, "RbcS") == 0);
627 }
628 
629 static Boolean IsITS (
630   CharPtr name
631 )
632 
633 {
634   return (StringICmp (name, "its1") == 0 ||
635           StringICmp (name, "its 1") == 0 ||
636           StringICmp (name, "its2") == 0 ||
637           StringICmp (name, "its 2") == 0 ||
638           StringICmp (name, "its3") == 0 ||
639           StringICmp (name, "its 3") == 0 ||
640           StringICmp (name, "Ribosomal DNA internal transcribed spacer 1") == 0 ||
641           StringICmp (name, "Ribosomal DNA internal transcribed spacer 2") == 0 ||
642           StringICmp (name, "Ribosomal DNA internal transcribed spacer 3") == 0 ||
643           StringICmp (name, "internal transcribed spacer 1 (ITS1)") == 0 ||
644           StringICmp (name, "internal transcribed spacer 2 (ITS2)") == 0 ||
645           StringICmp (name, "internal transcribed spacer 3 (ITS3)") == 0);
646 }
647 
648 static Boolean HasSgml (
649   CharPtr str
650 )
651 
652 {
653   Int2  ascii_len;
654   Char  buf [1024];
655 
656   if (StringHasNoText (str)) return FALSE;
657 
658   ascii_len = Sgml2AsciiLen (str);
659   if (ascii_len + 2 > sizeof (buf)) return FALSE;
660 
661   Sgml2Ascii (str, buf, ascii_len + 1);
662   if (StringCmp (str, buf) != 0) {
663     return TRUE;
664   }
665 
666   return FALSE;
667 }
668 
669 static void LookForBadDbxref (
670   ValNodePtr list,
671   ChangeDataPtr cdp,
672   Boolean isSource
673 )
674 
675 {
676   Boolean      cap;
677   DbtagPtr     dp;
678   CharPtr      good;
679   ObjectIdPtr  oip;
680   Boolean      ref;
681   Boolean      src;
682   CharPtr      str;
683   ValNodePtr   vnp;
684 
685   if (list == NULL || cdp == NULL) return;
686 
687   for (vnp = list; vnp != NULL; vnp = vnp->next) {
688     dp = (DbtagPtr) vnp->data.ptrvalue;
689     if (dp != NULL && StringDoesHaveText (dp->db)) {
690 
691       oip = dp->tag;
692       if (oip != NULL && StringDoesHaveText (oip->str)) {
693         if (StringChr (oip->str, ':') != NULL) {
694           cdp->multDbxref = TRUE;
695         }
696       }
697 
698       str = dp->db;
699       if (StringICmp (str, "PID") == 0 ||
700           StringICmp (str, "PIDg") == 0 ||
701           StringICmp (str, "PIDd") == 0 ||
702           StringICmp (str, "PIDe") == 0 ||
703           StringICmp (str, "NID") == 0 ||
704           StringICmp (str, "GI") == 0) {
705         cdp->privDbxref = TRUE;
706         continue;
707       }
708       if (StringICmp (str, "SWISS-PROT") == 0 ||
709           StringICmp (str, "SWISSPROT") == 0 ||
710           StringICmp (str, "SPTREMBL") == 0 ||
711           StringICmp (str, "SUBTILIS") == 0 ||
712           StringICmp (str, "MGD") == 0 ||
713           StringCmp (str, "cdd") == 0 ||
714           StringICmp (str, "TrEMBL") == 0 ||
715           StringICmp (str, "LocusID") == 0 ||
716           StringICmp (str, "MaizeDB") == 0 ||
717           StringICmp (str, "UniProt/Swiss-Prot") == 0 ||
718           StringICmp (str, "UniProt/TrEMBL") == 0 ||
719           StringICmp (str, "Genew") == 0 ||
720           StringICmp (str, "GENEDB") == 0 ||
721           StringICmp (str, "GreengenesID") == 0 ||
722           StringICmp (str, "HMPID") == 0 ||
723           StringICmp (str, "IFO") == 0 ||
724           StringICmp (str, "BHB") == 0 ||
725           StringICmp (str, "BioHealthBase") == 0) {
726         cdp->oldDbxref = TRUE;
727         continue;
728       }
729       if (StringICmp (str, "ATCC(dna)") == 0 ||
730           StringICmp (str, "ATCC(in host)") == 0 ||
731           StringICmp (str, "BDGP_EST") == 0 ||
732           StringICmp (str, "BDGP_INS") == 0 ||
733           StringICmp (str, "CGNC") == 0 ||
734           StringICmp (str, "CloneID") == 0 ||
735           StringICmp (str, "ENSEMBL") == 0 ||
736           StringICmp (str, "ESTLIB") == 0 ||
737           StringICmp (str, "GDB") == 0 ||
738           /*
739           StringICmp (str, "GOA") == 0 ||
740           */
741           StringICmp (str, "IMGT/HLA") == 0 ||
742           StringICmp (str, "PIR") == 0 ||
743           StringICmp (str, "PSEUDO") == 0 ||
744           StringICmp (str, "RZPD") == 0 ||
745           StringICmp (str, "SoyBase") == 0 ||
746           StringICmp (str, "UNILIB") == 0) {
747         cdp->rareDbxref = TRUE;
748         continue;
749       }
750       if (StringICmp (str, "MGD") == 0 || StringICmp (str, "MGI") == 0) {
751         oip = dp->tag;
752         if (oip != NULL && StringDoesHaveText (oip->str)) {
753           str = oip->str;
754           if (StringNICmp (str, "MGI:", 4) == 0 || StringNICmp (str, "MGD:", 4) == 0) {
755             cdp->oldDbxref = TRUE;
756             continue;
757           }
758         }
759       } else if (StringICmp (str, "HPRD") == 0) {
760         oip = dp->tag;
761         if (oip != NULL && StringDoesHaveText (oip->str)) {
762           str = oip->str;
763           if (StringNICmp (str, "HPRD_", 5) == 0) {
764             cdp->oldDbxref = TRUE;
765             continue;
766           }
767         }
768       }
769 
770       if (isSource && StringCmp (str, "taxon") == 0) continue;
771 
772       if (DbxrefIsValid (str, &ref, &src, &cap, &good)) {
773         if (ref && (! cdp->isRefSeq)) {
774           cdp->refDbxref = TRUE;
775         }
776         if (isSource && (! src)) {
777           cdp->srcDbxref = TRUE;
778         }
779         if (cap) {
780           cdp->capDbxref = TRUE;
781         }
782       } else {
783         cdp->badDbxref = TRUE;
784       }
785     }
786   }
787 }
788 
789 static void ScoreFeature (
790   SeqFeatPtr sfp,
791   Pointer userdata
792 )
793 
794 {
795   BioSourcePtr   biop;
796   ChangeDataPtr  cdp;
797   Char           ch;
798   CharPtr        comment;
799   CdRegionPtr    crp;
800   CharPtr        desc;
801   GBQualPtr      gbq;
802   GeneRefPtr     grp;
803   CharPtr        name;
804   OrgRefPtr      orp;
805   ProtRefPtr     prp;
806   CharPtr        ptr;
807   Uint1          residue;
808   RnaRefPtr      rrp;
809   CharPtr        str;
810   ValNodePtr     vnp;
811 
812   if (sfp == NULL) return;
813    cdp = (ChangeDataPtr) userdata;
814    if (cdp == NULL) return;
815 
816   comment = sfp->comment;
817   if (StringDoesHaveText (comment)) {
818     (cdp->sfpnote)++;
819   }
820 
821   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
822     if (StringCmp (gbq->qual, "partial") == 0 ||
823         StringCmp (gbq->qual, "evidence") == 0 ||
824         StringCmp (gbq->qual, "exception") == 0 ||
825         StringCmp (gbq->qual, "note") == 0 ||
826         StringCmp (gbq->qual, "notes") == 0 ||
827         StringCmp (gbq->qual, "comment") == 0 ||
828         StringCmp (gbq->qual, "db_xref") == 0 ||
829         StringCmp (gbq->qual, "gdb_xref") == 0 ||
830         StringCmp (gbq->qual, "rpt_unit") == 0 ||
831         StringCmp (gbq->qual, "pseudo") == 0 ||
832         StringCmp (gbq->qual, "gene") == 0 ||
833         StringCmp (gbq->qual, "codon_start") == 0 ||
834         StringCmp (gbq->qual, "transposon") == 0 ||
835         StringCmp (gbq->qual, "insertion_seq") == 0) {
836       cdp->oldgbqual = TRUE;
837     } else if (StringICmp (gbq->qual, "rpt_unit_seq") == 0) {
838       if (StringHasNoText (gbq->val)) continue;
839       ptr = gbq->val;
840       ch = *ptr;
841       while (ch != '\0') {
842         if (IS_UPPER (ch)) {
843           cdp->rpt_unit_seq = TRUE;
844         }
845         ptr++;
846         ch = *ptr;
847       }
848     }
849   }
850 
851   LookForBadDbxref (sfp->dbxref, cdp, FALSE);
852 
853   /* skip feature types that do not use data.value.ptrvalue */
854   switch (sfp->data.choice) {
855     case SEQFEAT_COMMENT:
856     case SEQFEAT_BOND:
857     case SEQFEAT_SITE:
858     case SEQFEAT_PSEC_STR:
859       return;
860     default:
861       break;
862   }
863 
864   if (sfp->data.value.ptrvalue == NULL) return;
865 
866   switch (sfp->data.choice) {
867     case SEQFEAT_GENE:
868       grp = (GeneRefPtr) sfp->data.value.ptrvalue;
869       if (HasSgml (grp->locus)) {
870         cdp->sgml = TRUE;
871       }
872       if (HasSgml (grp->desc)) {
873         cdp->sgml = TRUE;
874       }
875       for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
876         str = (CharPtr) vnp->data.ptrvalue;
877         if (StringHasNoText (str)) continue;
878         if (HasSgml (str)) {
879           cdp->sgml = TRUE;
880         }
881       }
882       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
883         if (StringCmp (gbq->qual, "map") == 0 ||
884             StringCmp (gbq->qual, "allele") == 0 ||
885             StringCmp (gbq->qual, "locus_tag") == 0 ||
886             StringCmp (gbq->qual, "old_locus_tag") == 0) {
887           cdp->oldgbqual = TRUE;
888         }
889       }
890       LookForBadDbxref (grp->db, cdp, FALSE);
891       break;
892     case SEQFEAT_CDREGION:
893       crp = (CdRegionPtr) sfp->data.value.ptrvalue;
894       if (crp->conflict) {
895         (cdp->cdsconf)++;
896       }
897       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
898         if (StringCmp (gbq->qual, "codon") != 0) continue;
899         if (StringHasNoText (gbq->val)) continue;
900         cdp->cdscodon = TRUE;
901       }
902       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
903         if (StringCmp (gbq->qual, "product") == 0 ||
904             StringCmp (gbq->qual, "function") == 0 ||
905             StringCmp (gbq->qual, "EC_number") == 0 ||
906             StringCmp (gbq->qual, "prot_note") == 0) {
907           cdp->oldgbqual = TRUE;
908         }
909       }
910       break;
911     case SEQFEAT_PROT:
912       prp = (ProtRefPtr) sfp->data.value.ptrvalue;
913       desc = prp->desc;
914       if (StringDoesHaveText (desc)) {
915         (cdp->protdesc)++;
916       }
917       for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
918         str = (CharPtr) vnp->data.ptrvalue;
919         if (StringHasNoText (str)) continue;
920         if (IsRubisco (str)) {
921           cdp->rubisco = TRUE;
922         }
923         if (IsRbc (str)) {
924           cdp->rbc = TRUE;
925         }
926       }
927       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
928         if (StringCmp (gbq->qual, "product") == 0 ||
929             StringCmp (gbq->qual, "function") == 0 ||
930             StringCmp (gbq->qual, "EC_number") == 0 ||
931             StringCmp (gbq->qual, "label") == 0 ||
932             StringCmp (gbq->qual, "allele") == 0) {
933           cdp->oldgbqual = TRUE;
934         }
935         if (StringCmp (gbq->qual, "standard_name") == 0 && prp->name == NULL) {
936           cdp->oldgbqual = TRUE;
937         }
938       }
939       LookForBadDbxref (prp->db, cdp, FALSE);
940       break;
941     case SEQFEAT_RNA :
942       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
943       if (rrp->type == 255 && rrp->ext.choice == 1) {
944         name = (CharPtr) rrp->ext.value.ptrvalue;
945         if (StringCmp (name, "misc_RNA") == 0) {
946           for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
947             if (StringCmp (gbq->qual, "product") != 0) continue;
948             name = gbq->val;
949             if (StringHasNoText (name)) continue;
950             if (IsITS (name)) {
951               cdp->its = TRUE;
952             }
953           }
954         } else if (StringCmp (name, "ncRNA") == 0 || StringCmp (name, "tmRNA") == 0) {
955         } else {
956           cdp->rnaother = TRUE;
957           if (IsITS (name)) {
958             cdp->its = TRUE;
959           }
960         }
961       } else if (rrp->type == 3 && rrp->ext.choice == 2) {
962         if (StringDoesHaveText (comment)) {
963           if (StringNCmp (comment, "aa: ", 4) == 0) {
964             comment += 4;
965           }
966           residue = FindTrnaAA3 (comment);
967           if (residue > 0 && residue != 255) {
968             cdp->trnanote = TRUE;
969           }
970           residue = FindTrnaAA (comment);
971           if (residue > 0 && residue != 255) {
972             cdp->trnanote = TRUE;
973           }
974         }
975       }
976       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
977         if (StringCmp (gbq->qual, "product") == 0 ||
978             StringCmp (gbq->qual, "ncRNA_class") == 0 ||
979             StringCmp (gbq->qual, "tag_peptide") == 0) {
980           cdp->oldgbqual = TRUE;
981         }
982       }
983       break;
984     case SEQFEAT_ORG :
985       orp = (OrgRefPtr) sfp->data.value.ptrvalue;
986       LookForBadDbxref (orp->db, cdp, TRUE);
987       cdp->badOrg = TRUE;
988       break;
989     case SEQFEAT_BIOSRC :
990       biop = (BioSourcePtr) sfp->data.value.ptrvalue;
991       orp = biop->org;
992       if (orp != NULL) {
993         LookForBadDbxref (orp->db, cdp, TRUE);
994       }
995     default:
996       break;
997   }
998 }
999 
1000 static void ScoreDescriptor (
1001   SeqDescrPtr sdp,
1002   Pointer userdata
1003 )
1004 
1005 {
1006   BioSourcePtr   biop;
1007   ChangeDataPtr  cdp;
1008   GBBlockPtr     gbp;
1009   MolInfoPtr     mip;
1010   OrgRefPtr      orp;
1011 
1012   if (sdp == NULL) return;
1013   cdp = (ChangeDataPtr) userdata;
1014   if (cdp == NULL) return;
1015 
1016   switch (sdp->choice) {
1017     case Seq_descr_genbank :
1018       gbp = (GBBlockPtr) sdp->data.ptrvalue;
1019       if (gbp != NULL) {
1020         if (StringDoesHaveText (gbp->source)) {
1021           (cdp->gbsource)++;
1022         }
1023       }
1024       break;
1025     case Seq_descr_molinfo :
1026       mip = (MolInfoPtr) sdp->data.ptrvalue;
1027       if (mip != NULL) {
1028         switch (mip->biomol) {
1029           case MOLECULE_TYPE_SNRNA:
1030           case MOLECULE_TYPE_SCRNA:
1031           case MOLECULE_TYPE_SNORNA:
1032             cdp->oldbiomol = TRUE;
1033             break;
1034           default :
1035             break;
1036         }
1037       }
1038       break;
1039     case Seq_descr_org :
1040       orp = (OrgRefPtr) sdp->data.ptrvalue;
1041       if (orp != NULL) {
1042         LookForBadDbxref (orp->db, cdp, TRUE);
1043       }
1044       cdp->badOrg = TRUE;
1045       break;
1046     case Seq_descr_source :
1047       biop = (BioSourcePtr) sdp->data.ptrvalue;
1048       if (biop != NULL) {
1049         orp = biop->org;
1050         if (orp != NULL) {
1051           LookForBadDbxref (orp->db, cdp, TRUE);
1052         }
1053       }
1054       break;
1055     default :
1056       break;
1057   }
1058 }
1059 
1060 static void CheckForUnpubPub (
1061   PubdescPtr pdp,
1062   Pointer userdata
1063 )
1064 
1065 {
1066   ChangeDataPtr  cdp;
1067   CitGenPtr      cgp;
1068   ValNodePtr     vnp;
1069 
1070   if (pdp == NULL) return;
1071   cdp = (ChangeDataPtr) userdata;
1072   if (cdp == NULL) return;
1073 
1074   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
1075     if (vnp->choice == PUB_Gen) {
1076       cgp = (CitGenPtr) vnp->data.ptrvalue;
1077       if (cgp != NULL) {
1078         if (StringICmp (cgp->cit, "Unpublished") == 0) {
1079           if (StringICmp (cgp->title, "Direct Submission") != 0) {
1080             cdp->hasUnpublished = TRUE;
1081           }
1082         }
1083       }
1084     } else if (vnp->choice == PUB_Muid || vnp->choice == PUB_PMid) {
1085       cdp->hasPublished = TRUE;
1086     } else if (vnp->choice == PUB_Article || vnp->choice == PUB_Book || vnp->choice == PUB_Man) {
1087       cdp->hasPublished = TRUE;
1088     }
1089   }
1090 }
1091 
1092 static void CheckForChanges (
1093   SeqEntryPtr sep,
1094   ChangeDataPtr cdp
1095 )
1096 
1097 {
1098   if (sep == NULL || cdp == NULL) return;
1099 
1100   VisitFeaturesInSep (sep, (Pointer) cdp, ScoreFeature);
1101   VisitDescriptorsInSep (sep, (Pointer) cdp, ScoreDescriptor);
1102   VisitPubdescsInSep (sep, (Pointer) cdp, CheckForUnpubPub);
1103 }
1104 
1105 static void StripBadProtTitles (
1106   BioseqPtr bsp,
1107   Pointer userdata
1108 )
1109 
1110 {
1111   CharPtr            buf;
1112   size_t             buflen = 1001;
1113   ObjValNodePtr      ovp;
1114   SeqIdPtr           sip;
1115   CharPtr            title;
1116   ValNodePtr         vnp;
1117 
1118   if (bsp == NULL) return;
1119   if (! ISA_aa (bsp->mol)) return;
1120   for (sip = bsp->id; sip != NULL; sip = sip->next) {
1121     if (sip->choice == SEQID_OTHER) return;
1122   }
1123 
1124   vnp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
1125   if (vnp == NULL) return;
1126   title = (CharPtr) vnp->data.ptrvalue;
1127   if (StringHasNoText (title)) return;
1128 
1129   buf = MemNew (sizeof (Char) * (buflen + 1));
1130   if (buf == NULL) return;
1131 
1132   if (NewCreateDefLineBuf (NULL, bsp, buf, buflen, TRUE, FALSE)) {
1133     if (StringICmp (buf, title) != 0) {
1134       if (vnp->extended != 0) {
1135         ovp = (ObjValNodePtr) vnp;
1136         ovp->idx.deleteme = TRUE;
1137       }
1138     }
1139   }
1140 
1141   MemFree (buf);
1142 }
1143 
1144 static void BadProtTitleProc (
1145   SeqEntryPtr sep,
1146   Pointer mydata,
1147   Int4 index,
1148   Int2 indent
1149 )
1150 
1151 {
1152   BioseqSetPtr  bssp;
1153 
1154   if (sep == NULL) return;
1155   if (! IS_Bioseq_set (sep)) return;
1156   bssp = (BioseqSetPtr) sep->data.ptrvalue;
1157   if (bssp->_class != BioseqseqSet_class_nuc_prot) return;
1158   VisitBioseqsInSep (sep, NULL, StripBadProtTitles);
1159 }
1160 
1161 static void BSSaveToFile (
1162   ByteStorePtr bs,
1163   CharPtr path
1164 )
1165 
1166 {
1167   Byte  buf [256];
1168   Int4  count;
1169   FILE  *fp;
1170 
1171   if (bs == NULL || StringHasNoText (path)) return;
1172 
1173   fp = FileOpen (path, "w");
1174   if (fp != NULL) {
1175     Nlm_BSSeek (bs, 0, SEEK_SET);
1176     count = BSRead (bs, buf, sizeof (buf));
1177     while (count > 0) {
1178       FileWrite (buf, count, 1, fp);
1179       count = BSRead (bs, buf, sizeof (buf));
1180     }
1181     FileClose (fp);
1182   }
1183 }
1184 
1185 typedef struct diffblock {
1186   ValNodePtr  head;
1187   ValNodePtr  tail;
1188 } DiffBlock, PNTR DiffBlockPtr;
1189 
1190 static void RecordDiffBlock (
1191   DiffBlockPtr dbp,
1192   CharPtr str
1193 )
1194 
1195 {
1196   ValNodePtr  vnp;
1197 
1198   if (dbp == NULL || StringHasNoText (str)) return;
1199 
1200   vnp = ValNodeCopyStr (&(dbp->tail), 0, str);
1201   if (dbp->head == NULL) {
1202     dbp->head = vnp;
1203   }
1204   dbp->tail = vnp;
1205 }
1206 
1207 static void WriteDiffBlock (
1208   DiffBlockPtr dbp,
1209   FILE *fp
1210 )
1211 
1212 {
1213   Char        ch;
1214   Int2        idx;
1215   Int2        margin = INT2_MAX;
1216   CharPtr     ptr;
1217   Int2        spaces;
1218   CharPtr     str;
1219   ValNodePtr  vnp;
1220 
1221   if (dbp == NULL || dbp->head == NULL || fp == NULL) return;
1222 
1223   for (vnp = dbp->head; vnp != NULL; vnp = vnp->next) {
1224     str = (CharPtr) vnp->data.ptrvalue;
1225     if (StringHasNoText (str)) continue;
1226     ch = str [0];
1227     if (ch == '<' || ch == '>') {
1228       ptr = str + 1;
1229       ch = *ptr;
1230       spaces = 0;
1231       while (ch == ' ') {
1232         spaces++;
1233         ptr++;
1234         ch = *ptr;
1235       }
1236       if (spaces < margin) {
1237         margin = spaces;
1238       }
1239     }
1240   }
1241 
1242   if (margin > 80) {
1243     margin = 80;
1244   }
1245 
1246   for (vnp = dbp->head; vnp != NULL; vnp = vnp->next) {
1247     str = (CharPtr) vnp->data.ptrvalue;
1248     if (StringHasNoText (str)) continue;
1249     ch = str [0];
1250     if (ch == '<' || ch == '>') {
1251       ptr = str + 1;
1252       ch = *ptr;
1253       idx = 0;
1254       while (idx < margin && ch == ' ') {
1255         idx++;
1256         ptr++;
1257         ch = *ptr;
1258       }
1259       fprintf (fp, "%c %s\n", str [0], ptr);
1260     } else if (ch == '-') {
1261       fprintf (fp, "---\n");
1262     } else if (ch == '=') {
1263       fprintf (fp, "===\n");
1264     }
1265   }
1266 
1267   fprintf (fp, "\n");
1268   fflush (fp);
1269 }
1270 
1271 static void ResetDiffBlock (
1272   DiffBlockPtr dbp
1273 )
1274 
1275 {
1276   if (dbp == NULL) return;
1277 
1278   dbp->head = ValNodeFreeData (dbp->head);
1279   dbp->tail = NULL;
1280 }
1281 
1282 static void ReportAsnDiffs (
1283   FILE *logfp,
1284   CharPtr id,
1285   ByteStorePtr bs1,
1286   ByteStorePtr bs2)
1287 
1288 {
1289 #ifdef OS_UNIX
1290   Char       ch;
1291   Char       cmmd [512];
1292   DiffBlock  db;
1293   int        diff;
1294   FileCache  fc;
1295   FILE       *fp;
1296   Char       line [512];
1297   Char       path1 [PATH_MAX];
1298   Char       path2 [PATH_MAX];
1299   Char       path3 [PATH_MAX];
1300   CharPtr    str;
1301 
1302   if (logfp == NULL || StringHasNoText (id)) return;
1303   if (bs1 == NULL || bs2 == NULL) return;
1304 
1305   TmpNam (path1);
1306   TmpNam (path2);
1307   TmpNam (path3);
1308 
1309   BSSaveToFile (bs1, path1);
1310   BSSaveToFile (bs2, path2);
1311 
1312   db.head = NULL;
1313   db.tail = NULL;
1314 
1315   sprintf (cmmd, "diff -b -h %s %s > %s", path1, path2, path3);
1316   diff = system (cmmd);
1317 
1318   if (diff > 0) {
1319     fp = FileOpen (path3, "r");
1320     if (fp != NULL) {
1321       fprintf (logfp, "\n\n%s\n\n", id);
1322       fflush (logfp);
1323       if (FileCacheSetup (&fc, fp)) {
1324         str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1325         while (str != NULL) {
1326           ch = line [0];
1327           if (ch == '<' || ch == '>') {
1328             RecordDiffBlock (&db, line);
1329           } else if (ch == '-') {
1330             RecordDiffBlock (&db, "---");
1331           } else if (IS_DIGIT (ch)) {
1332             WriteDiffBlock (&db, logfp);
1333             ResetDiffBlock (&db);
1334             RecordDiffBlock (&db, "===");
1335           } else if (StringHasNoText (str)) {
1336             WriteDiffBlock (&db, logfp);
1337             ResetDiffBlock (&db);
1338           }
1339           str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1340         }
1341         WriteDiffBlock (&db, logfp);
1342         ResetDiffBlock (&db);
1343       }
1344       fprintf (logfp, "//\n\n");
1345       FileClose (fp);
1346     }
1347   }
1348 
1349   sprintf (cmmd, "rm %s; rm %s; rm %s", path1, path2, path3);
1350   system (cmmd);
1351 #endif
1352 }
1353 
1354 static void DoAsnDiffReport (
1355   SeqEntryPtr sep,
1356   CleanFlagPtr cfp
1357 )
1358 
1359 {
1360   ByteStorePtr  bs = NULL, tmp = NULL;
1361   Boolean       okay = FALSE;
1362 
1363   if (sep == NULL || cfp == NULL) return;
1364 
1365   RemoveAllNcbiCleanupUserObjects (sep);
1366 
1367   /* Capital letters avoid unwanted diffs on issues already fixed in ID */
1368 
1369   if (StringChr (cfp->selective, 'A') != NULL) {
1370     VisitPubdescsInSep (sep, NULL, CleanupPubAuthors);
1371   }
1372   if (StringChr (cfp->selective, 'P') != NULL) {
1373     VisitPubdescsInSep (sep, (Pointer) cfp, CleanupPubBody);
1374   }
1375   if (StringChr (cfp->selective, 'L') != NULL) {
1376     VisitFeaturesInSep (sep, NULL, CleanupLocation);
1377   }
1378   if (StringChr (cfp->selective, 'R') != NULL) {
1379     VisitFeaturesInSep (sep, NULL, CleanupMostRNAs);
1380     VisitFeaturesInSep (sep, NULL, CleanupRemainingRNAs);
1381     VisitFeaturesInSep (sep, NULL, ModRNAs);
1382   }
1383   if (StringChr (cfp->selective, 'S') != NULL) {
1384     SortSeqEntryQualifiers (sep);
1385   }
1386   if (StringChr (cfp->selective, 'G') != NULL) {
1387     EntryChangeGBSource (sep);
1388     EntryCheckGBBlock (sep);
1389   }
1390   if (StringChr (cfp->selective, 'K') != NULL) {
1391     MoveFeatsFromPartsSet (sep);
1392     move_cds_ex (sep, TRUE);
1393   }
1394   if (StringChr (cfp->selective, 'M') != NULL) {
1395     SeqEntryPubsAsn4 (sep);
1396   }
1397 
1398   NormalizeDescriptorOrder (sep);
1399 
1400   /* Look for change in single issue */
1401 
1402   bs = Se2BsX (sep);
1403   if (StringHasNoText (cfp->selective)) {
1404     okay = TRUE;
1405   } else if (StringChr (cfp->selective, 'a') != NULL) {
1406     VisitPubdescsInSep (sep, NULL, CleanupPubAuthors);
1407     tmp = Se2BsX (sep);
1408     if (! BSEqual (bs, tmp)) {
1409       okay = TRUE;
1410     }
1411     tmp = BSFree (tmp);
1412   } else if (StringChr (cfp->selective, 'p') != NULL) {
1413     VisitPubdescsInSep (sep, (Pointer) cfp, CleanupPubBody);
1414     tmp = Se2BsX (sep);
1415     if (! BSEqual (bs, tmp)) {
1416       okay = TRUE;
1417     }
1418     tmp = BSFree (tmp);
1419   } else if (StringChr (cfp->selective, 'l') != NULL) {
1420     VisitFeaturesInSep (sep, NULL, CleanupLocation);
1421     tmp = Se2BsX (sep);
1422     if (! BSEqual (bs, tmp)) {
1423       okay = TRUE;
1424     }
1425     tmp = BSFree (tmp);
1426   } else if (StringChr (cfp->selective, 'r') != NULL) {
1427     VisitFeaturesInSep (sep, NULL, CleanupMostRNAs);
1428     VisitFeaturesInSep (sep, NULL, CleanupRemainingRNAs);
1429     VisitFeaturesInSep (sep, NULL, ModRNAs);
1430     tmp = Se2BsX (sep);
1431     if (! BSEqual (bs, tmp)) {
1432       okay = TRUE;
1433     }
1434     tmp = BSFree (tmp);
1435   } else if (StringChr (cfp->selective, 's') != NULL) {
1436     SortSeqEntryQualifiers (sep);
1437     tmp = Se2BsX (sep);
1438     if (! BSEqual (bs, tmp)) {
1439       okay = TRUE;
1440     }
1441     tmp = BSFree (tmp);
1442   } else if (StringChr (cfp->selective, 'g') != NULL) {
1443     EntryChangeGBSource (sep);
1444     EntryCheckGBBlock (sep);
1445     tmp = Se2BsX (sep);
1446     if (! BSEqual (bs, tmp)) {
1447       okay = TRUE;
1448     }
1449     tmp = BSFree (tmp);
1450   } else if (StringChr (cfp->selective, 'k') != NULL) {
1451     MoveFeatsFromPartsSet (sep);
1452     move_cds_ex (sep, TRUE);
1453     tmp = Se2BsX (sep);
1454     if (! BSEqual (bs, tmp)) {
1455       okay = TRUE;
1456     }
1457     tmp = BSFree (tmp);
1458   } else if (StringChr (cfp->selective, 'm') != NULL) {
1459     SeqEntryPubsAsn4 (sep);
1460     NormalizeDescriptorOrder (sep);
1461     tmp = Se2BsX (sep);
1462     if (! BSEqual (bs, tmp)) {
1463       okay = TRUE;
1464     }
1465     tmp = BSFree (tmp);
1466   } else {
1467     okay = TRUE;
1468   }
1469 
1470   /* Report incremental diff */
1471 
1472   if (okay) {
1473     SeriousSeqEntryCleanup (sep, NULL, NULL);
1474     RemoveAllNcbiCleanupUserObjects (sep);
1475     NormalizeDescriptorOrder (sep);
1476     tmp = Se2BsX (sep);
1477     if (! BSEqual (bs, tmp)) {
1478       if (cfp->logfp != NULL) {
1479         ReportAsnDiffs (cfp->logfp, cfp->buf, bs, tmp);
1480       }
1481     }
1482     tmp = BSFree (tmp);
1483   }
1484 
1485   BSFree (bs);
1486 }
1487 
1488 static void DoASNReport (
1489   SeqEntryPtr sep,
1490   CleanFlagPtr cfp,
1491   Boolean dossec,
1492   Boolean quick
1493 )
1494 
1495 {
1496   Boolean       auth = FALSE, bsec = FALSE, clnr = FALSE, gbbk = FALSE,
1497                 modr = FALSE, move = FALSE, norm = FALSE, othr = FALSE,
1498                 pack = FALSE, publ = FALSE, ssec = FALSE, sloc = FALSE,
1499                 sort = FALSE, titl = FALSE, ncbiusrobj = FALSE;
1500   ByteStorePtr  bs = NULL, tmp = NULL;
1501   ChangeData    cdbefore, cdafter;
1502   Uint2         entityID;
1503 
1504   if (sep == NULL || cfp == NULL) return;
1505 
1506   if (FindNcbiCleanupUserObject (sep) != NULL) {
1507     ncbiusrobj = TRUE;
1508   }
1509 
1510   RemoveAllNcbiCleanupUserObjects (sep);
1511 
1512   if (quick) {
1513     bs = Se2Bs (sep);
1514 
1515     NormalizeDescriptorOrder (sep);
1516     tmp = Se2Bs (sep);
1517     if (! BSEqual (bs, tmp)) {
1518       norm = TRUE;
1519     }
1520     BSFree (bs);
1521     bs = tmp;
1522 
1523     SeriousSeqEntryCleanup (sep, NULL, NULL);
1524     RemoveAllNcbiCleanupUserObjects (sep);
1525     NormalizeDescriptorOrder (sep);
1526     tmp = Se2Bs (sep);
1527     if (! BSEqual (bs, tmp)) {
1528       ssec = TRUE;
1529     }
1530     BSFree (bs);
1531     bs = tmp;
1532 
1533     BSFree (bs);
1534 
1535     if (ssec) {
1536       (cfp->rawcounts.ssec)++;
1537       (cfp->cumcounts.ssec)++;
1538       if (cfp->logfp != NULL) {
1539         fprintf (cfp->logfp, "SSEC %s\n", cfp->buf);
1540         fflush (cfp->logfp);
1541       }
1542     } else if (norm) {
1543       (cfp->rawcounts.norm)++;
1544       (cfp->cumcounts.norm)++;
1545       if (cfp->logfp != NULL) {
1546         fprintf (cfp->logfp, "NORM %s\n", cfp->buf);
1547         fflush (cfp->logfp);
1548       }
1549     } else {
1550       (cfp->rawcounts.okay)++;
1551       (cfp->cumcounts.okay)++;
1552       if (cfp->logfp != NULL) {
1553         fprintf (cfp->logfp, "OKAY %s\n", cfp->buf);
1554         fflush (cfp->logfp);
1555       }
1556     }
1557 
1558     return;
1559   }
1560 
1561   MemSet ((Pointer) &cdbefore, 0, sizeof (ChangeData));
1562   MemSet ((Pointer) &cdafter, 0, sizeof (ChangeData));
1563 
1564   cdbefore.isRefSeq = cfp->isRefSeq;
1565   cdafter.isRefSeq = cfp->isRefSeq;
1566 
1567   CheckForChanges (sep, &cdbefore);
1568 
1569   bs = Se2Bs (sep);
1570 
1571   NormalizeDescriptorOrder (sep);
1572   tmp = Se2Bs (sep);
1573   if (! BSEqual (bs, tmp)) {
1574     norm = TRUE;
1575   }
1576   BSFree (bs);
1577   bs = tmp;
1578 
1579   VisitFeaturesInSep (sep, NULL, CleanupLocation);
1580   tmp = Se2Bs (sep);
1581   if (! BSEqual (bs, tmp)) {
1582     sloc = TRUE;
1583   }
1584   BSFree (bs);
1585   bs = tmp;
1586 
1587   VisitFeaturesInSep (sep, NULL, CleanupMostRNAs);
1588   tmp = Se2Bs (sep);
1589   if (! BSEqual (bs, tmp)) {
1590     clnr = TRUE;
1591   }
1592   BSFree (bs);
1593   bs = tmp;
1594 
1595   VisitFeaturesInSep (sep, NULL, CleanupRemainingRNAs);
1596   tmp = Se2Bs (sep);
1597   if (! BSEqual (bs, tmp)) {
1598     othr = TRUE;
1599   }
1600   BSFree (bs);
1601   bs = tmp;
1602 
1603   VisitFeaturesInSep (sep, NULL, ModRNAs);
1604   tmp = Se2Bs (sep);
1605   if (! BSEqual (bs, tmp)) {
1606     modr = TRUE;
1607   }
1608   BSFree (bs);
1609   bs = tmp;
1610 
1611   VisitPubdescsInSep (sep, NULL, CleanupPubAuthors);
1612   tmp = Se2Bs (sep);
1613   if (! BSEqual (bs, tmp)) {
1614     auth = TRUE;
1615   }
1616   BSFree (bs);
1617   bs = tmp;
1618 
1619   VisitPubdescsInSep (sep, (Pointer) cfp, CleanupPubBody);
1620   tmp = Se2Bs (sep);
1621   if (! BSEqual (bs, tmp)) {
1622     publ = TRUE;
1623   }
1624   BSFree (bs);
1625   bs = tmp;
1626 
1627   SortSeqEntryQualifiers (sep);
1628   tmp = Se2Bs (sep);
1629   if (! BSEqual (bs, tmp)) {
1630     sort = TRUE;
1631   }
1632   BSFree (bs);
1633   bs = tmp;
1634 
1635   BasicSeqEntryCleanup (sep);
1636   tmp = Se2Bs (sep);
1637   if (! BSEqual (bs, tmp)) {
1638     bsec = TRUE;
1639   }
1640   BSFree (bs);
1641   bs = tmp;
1642 
1643   EntryChangeGBSource (sep);
1644   EntryCheckGBBlock (sep);
1645   tmp = Se2Bs (sep);
1646   if (! BSEqual (bs, tmp)) {
1647     gbbk = TRUE;
1648   }
1649   BSFree (bs);
1650   bs = tmp;
1651 
1652   entityID = ObjMgrGetEntityIDForChoice (sep);
1653   SeqMgrIndexFeatures (entityID, NULL);
1654   SeqEntryExplore (sep, NULL, BadProtTitleProc);
1655   DeleteMarkedObjects (0, OBJ_SEQENTRY, (Pointer) sep);
1656   SeqMgrIndexFeatures (entityID, NULL);
1657   InstantiateProteinTitles (entityID, NULL);
1658   SeqMgrClearFeatureIndexes (entityID, NULL);
1659   BasicSeqEntryCleanup (sep);
1660   NormalizeDescriptorOrder (sep);
1661   tmp = Se2Bs (sep);
1662   if (! BSEqual (bs, tmp)) {
1663     titl = TRUE;
1664   }
1665   BSFree (bs);
1666   bs = tmp;
1667 
1668   MoveFeatsFromPartsSet (sep);
1669   move_cds_ex (sep, TRUE);
1670   tmp = Se2Bs (sep);
1671   if (! BSEqual (bs, tmp)) {
1672     pack = TRUE;
1673   }
1674   BSFree (bs);
1675   bs = tmp;
1676 
1677   SeqEntryPubsAsn4 (sep);
1678   NormalizeDescriptorOrder (sep);
1679   tmp = Se2Bs (sep);
1680   if (! BSEqual (bs, tmp)) {
1681     move = TRUE;
1682   }
1683   BSFree (bs);
1684   bs = tmp;
1685 
1686   if (dossec) {
1687     SeriousSeqEntryCleanup (sep, NULL, NULL);
1688     RemoveAllNcbiCleanupUserObjects (sep);
1689     NormalizeDescriptorOrder (sep);
1690     tmp = Se2Bs (sep);
1691     if (! BSEqual (bs, tmp)) {
1692       ssec = TRUE;
1693     }
1694     BSFree (bs);
1695     bs = tmp;
1696   }
1697 
1698   BSFree (bs);
1699 
1700   CheckForChanges (sep, &cdafter);
1701 
1702   if (ssec) {
1703     (cfp->rawcounts.ssec)++;
1704     (cfp->cumcounts.ssec)++;
1705     if (cfp->logfp != NULL) {
1706       fprintf (cfp->logfp, "SSEC %s\n", cfp->buf);
1707       fflush (cfp->logfp);
1708     }
1709   } else if (move) {
1710     (cfp->rawcounts.move)++;
1711     (cfp->cumcounts.move)++;
1712     if (cfp->logfp != NULL) {
1713       fprintf (cfp->logfp, "MOVE %s\n", cfp->buf);
1714       fflush (cfp->logfp);
1715     }
1716   } else if (pack) {
1717     (cfp->rawcounts.pack)++;
1718     (cfp->cumcounts.pack)++;
1719     if (cfp->logfp != NULL) {
1720       fprintf (cfp->logfp, "PACK %s\n", cfp->buf);
1721       fflush (cfp->logfp);
1722     }
1723   } else if (titl) {
1724     (cfp->rawcounts.titl)++;
1725     (cfp->cumcounts.titl)++;
1726     if (cfp->logfp != NULL) {
1727       fprintf (cfp->logfp, "TITL %s\n", cfp->buf);
1728       fflush (cfp->logfp);
1729     }
1730   } else if (gbbk) {
1731     (cfp->rawcounts.gbbk)++;
1732     (cfp->cumcounts.gbbk)++;
1733     if (cfp->logfp != NULL) {
1734       fprintf (cfp->logfp, "GBBK %s\n", cfp->buf);
1735       fflush (cfp->logfp);
1736     }
1737   } else if (bsec) {
1738     (cfp->rawcounts.bsec)++;
1739     (cfp->cumcounts.bsec)++;
1740     if (cfp->logfp != NULL) {
1741       fprintf (cfp->logfp, "BSEC %s\n", cfp->buf);
1742       fflush (cfp->logfp);
1743     }
1744   } else if (sort) {
1745     (cfp->rawcounts.sort)++;
1746     (cfp->cumcounts.sort)++;
1747     if (cfp->logfp != NULL) {
1748       fprintf (cfp->logfp, "SORT %s\n", cfp->buf);
1749       fflush (cfp->logfp);
1750     }
1751   } else if (sloc) {
1752     (cfp->rawcounts.sloc)++;
1753     (cfp->cumcounts.sloc)++;
1754     if (cfp->logfp != NULL) {
1755       fprintf (cfp->logfp, "SLOC %s\n", cfp->buf);
1756       fflush (cfp->logfp);
1757     }
1758   } else if (clnr) {
1759     (cfp->rawcounts.clnr)++;
1760     (cfp->cumcounts.clnr)++;
1761     if (cfp->logfp != NULL) {
1762       fprintf (cfp->logfp, "CLNR %s\n", cfp->buf);
1763       fflush (cfp->logfp);
1764     }
1765   } else if (othr) {
1766     (cfp->rawcounts.othr)++;
1767     (cfp->cumcounts.othr)++;
1768     if (cfp->logfp != NULL) {
1769       fprintf (cfp->logfp, "OTHR %s\n", cfp->buf);
1770       fflush (cfp->logfp);
1771     }
1772   } else if (modr) {
1773     (cfp->rawcounts.modr)++;
1774     (cfp->cumcounts.modr)++;
1775     if (cfp->logfp != NULL) {
1776       fprintf (cfp->logfp, "MODR %s\n", cfp->buf);
1777       fflush (cfp->logfp);
1778     }
1779   } else if (publ) {
1780     (cfp->rawcounts.publ)++;
1781     (cfp->cumcounts.publ)++;
1782     if (cfp->logfp != NULL) {
1783       fprintf (cfp->logfp, "PUBL %s\n", cfp->buf);
1784       fflush (cfp->logfp);
1785     }
1786   } else if (auth) {
1787     (cfp->rawcounts.auth)++;
1788     (cfp->cumcounts.auth)++;
1789     if (cfp->logfp != NULL) {
1790       fprintf (cfp->logfp, "AUTH %s\n", cfp->buf);
1791       fflush (cfp->logfp);
1792     }
1793   } else if (norm) {
1794     (cfp->rawcounts.norm)++;
1795     (cfp->cumcounts.norm)++;
1796     if (cfp->logfp != NULL) {
1797       fprintf (cfp->logfp, "NORM %s\n", cfp->buf);
1798       fflush (cfp->logfp);
1799     }
1800   } else {
1801     (cfp->rawcounts.okay)++;
1802     (cfp->cumcounts.okay)++;
1803     if (cfp->logfp != NULL) {
1804       fprintf (cfp->logfp, "OKAY %s\n", cfp->buf);
1805       fflush (cfp->logfp);
1806     }
1807   }
1808 
1809   if (cdbefore.oldgbqual) {
1810     if (cfp->logfp != NULL) {
1811       fprintf (cfp->logfp, "GBQ %s\n", cfp->buf);
1812       fflush (cfp->logfp);
1813     }
1814   }
1815   if (cdbefore.sgml) {
1816     if (cfp->logfp != NULL) {
1817       fprintf (cfp->logfp, "SGM %s\n", cfp->buf);
1818       fflush (cfp->logfp);
1819     }
1820   }
1821   if (cdbefore.cdscodon) {
1822     if (cfp->logfp != NULL) {
1823       fprintf (cfp->logfp, "CDN %s\n", cfp->buf);
1824       fflush (cfp->logfp);
1825     }
1826   }
1827   if (cdbefore.rubisco) {
1828     if (cfp->logfp != NULL) {
1829       fprintf (cfp->logfp, "RUB %s\n", cfp->buf);
1830       fflush (cfp->logfp);
1831     }
1832   }
1833   if (cdbefore.rbc) {
1834     if (cfp->logfp != NULL) {
1835       fprintf (cfp->logfp, "RBC %s\n", cfp->buf);
1836       fflush (cfp->logfp);
1837     }
1838   }
1839   if (cdbefore.its) {
1840     if (cfp->logfp != NULL) {
1841       fprintf (cfp->logfp, "ITS %s\n", cfp->buf);
1842       fflush (cfp->logfp);
1843     }
1844   }
1845   if (cdbefore.rnaother) {
1846     if (cfp->logfp != NULL) {
1847       fprintf (cfp->logfp, "RNA %s\n", cfp->buf);
1848       fflush (cfp->logfp);
1849     }
1850   }
1851   if (cdbefore.trnanote) {
1852     if (cfp->logfp != NULL) {
1853       fprintf (cfp->logfp, "TRN %s\n", cfp->buf);
1854       fflush (cfp->logfp);
1855     }
1856   }
1857   if (cdbefore.oldbiomol) {
1858     if (cfp->logfp != NULL) {
1859       fprintf (cfp->logfp, "MOL %s\n", cfp->buf);
1860       fflush (cfp->logfp);
1861     }
1862   }
1863   if (cdbefore.badOrg) {
1864     if (cfp->logfp != NULL) {
1865       fprintf (cfp->logfp, "ORG %s\n", cfp->buf);
1866       fflush (cfp->logfp);
1867     }
1868   }
1869   if (cdbefore.rpt_unit_seq) {
1870     if (cfp->logfp != NULL) {
1871       fprintf (cfp->logfp, "RUS %s\n", cfp->buf);
1872       fflush (cfp->logfp);
1873     }
1874   }
1875 
1876   if (cdbefore.badDbxref) {
1877     if (cfp->logfp != NULL) {
1878       fprintf (cfp->logfp, "BDX %s\n", cfp->buf);
1879       fflush (cfp->logfp);
1880     }
1881   }
1882   if (cdbefore.refDbxref) {
1883     if (cfp->logfp != NULL) {
1884       fprintf (cfp->logfp, "FDX %s\n", cfp->buf);
1885       fflush (cfp->logfp);
1886     }
1887   }
1888   if (cdbefore.srcDbxref) {
1889     if (cfp->logfp != NULL) {
1890       fprintf (cfp->logfp, "SDX %s\n", cfp->buf);
1891       fflush (cfp->logfp);
1892     }
1893   }
1894   if (cdbefore.capDbxref) {
1895     if (cfp->logfp != NULL) {
1896       fprintf (cfp->logfp, "CDX %s\n", cfp->buf);
1897       fflush (cfp->logfp);
1898     }
1899   }
1900   if (cdbefore.privDbxref) {
1901     if (cfp->logfp != NULL) {
1902       fprintf (cfp->logfp, "PDX %s\n", cfp->buf);
1903       fflush (cfp->logfp);
1904     }
1905   }
1906   if (cdbefore.oldDbxref) {
1907     if (cfp->logfp != NULL) {
1908       fprintf (cfp->logfp, "ODX %s\n", cfp->buf);
1909       fflush (cfp->logfp);
1910     }
1911   }
1912   if (cdbefore.multDbxref) {
1913     if (cfp->logfp != NULL) {
1914       fprintf (cfp->logfp, "MDX %s\n", cfp->buf);
1915       fflush (cfp->logfp);
1916     }
1917   }
1918   if (cdbefore.rareDbxref) {
1919     if (cfp->logfp != NULL) {
1920       fprintf (cfp->logfp, "RDX %s\n", cfp->buf);
1921       fflush (cfp->logfp);
1922     }
1923   }
1924   if (cdafter.hasUnpublished && ! cdafter.hasPublished) {
1925     if (cfp->logfp != NULL) {
1926       fprintf (cfp->logfp, "UNP %s\n", cfp->buf);
1927       fflush (cfp->logfp);
1928     }
1929   }
1930 
1931   if (sort) {
1932     if (cfp->logfp != NULL) {
1933       fprintf (cfp->logfp, "SRT %s\n", cfp->buf);
1934       fflush (cfp->logfp);
1935     }
1936   }
1937   if (sloc) {
1938     if (cfp->logfp != NULL) {
1939       fprintf (cfp->logfp, "SLC %s\n", cfp->buf);
1940       fflush (cfp->logfp);
1941     }
1942   }
1943   if (clnr) {
1944     if (cfp->logfp != NULL) {
1945       fprintf (cfp->logfp, "RCN %s\n", cfp->buf);
1946       fflush (cfp->logfp);
1947     }
1948   }
1949   if (othr) {
1950     if (cfp->logfp != NULL) {
1951       fprintf (cfp->logfp, "RNO %s\n", cfp->buf);
1952       fflush (cfp->logfp);
1953     }
1954   }
1955   if (modr) {
1956     if (cfp->logfp != NULL) {
1957       fprintf (cfp->logfp, "RMD %s\n", cfp->buf);
1958       fflush (cfp->logfp);
1959     }
1960   }
1961   if (publ) {
1962     if (cfp->logfp != NULL) {
1963       fprintf (cfp->logfp, "PUB %s\n", cfp->buf);
1964       fflush (cfp->logfp);
1965     }
1966   }
1967   if (auth) {
1968     if (cfp->logfp != NULL) {
1969       fprintf (cfp->logfp, "ATH %s\n", cfp->buf);
1970       fflush (cfp->logfp);
1971     }
1972   }
1973   if (pack) {
1974     if (cfp->logfp != NULL) {
1975       fprintf (cfp->logfp, "PKG %s\n", cfp->buf);
1976       fflush (cfp->logfp);
1977     }
1978   }
1979   if (move) {
1980     if (cfp->logfp != NULL) {
1981       fprintf (cfp->logfp, "MVP %s\n", cfp->buf);
1982       fflush (cfp->logfp);
1983     }
1984   }
1985   if (titl) {
1986     if (cfp->logfp != NULL) {
1987       fprintf (cfp->logfp, "TTL %s\n", cfp->buf);
1988       fflush (cfp->logfp);
1989     }
1990   }
1991 
1992   if (cdbefore.protdesc != cdafter.protdesc) {
1993     if (cfp->logfp != NULL) {
1994       fprintf (cfp->logfp, "PRT %s\n", cfp->buf);
1995       fflush (cfp->logfp);
1996     }
1997   }
1998   if (cdbefore.sfpnote != cdafter.sfpnote) {
1999     if (cfp->logfp != NULL) {
2000       fprintf (cfp->logfp, "COM %s\n", cfp->buf);
2001       fflush (cfp->logfp);
2002     }
2003   }
2004   if (cdbefore.gbsource != cdafter.gbsource) {
2005     if (cfp->logfp != NULL) {
2006       fprintf (cfp->logfp, "SRC %s\n", cfp->buf);
2007       fflush (cfp->logfp);
2008     }
2009   }
2010   if (cdbefore.cdsconf != cdafter.cdsconf) {
2011     if (cfp->logfp != NULL) {
2012       fprintf (cfp->logfp, "CNF %s\n", cfp->buf);
2013       fflush (cfp->logfp);
2014     }
2015   }
2016 
2017   if (ncbiusrobj) {
2018     if (cfp->logfp != NULL) {
2019       fprintf (cfp->logfp, "USR %s\n", cfp->buf);
2020       fflush (cfp->logfp);
2021     }
2022   }
2023 }
2024 
2025 static CharPtr ffmod [] = {
2026   "",
2027   "release",
2028   "entrez",
2029   "gbench",
2030   "dump",
2031   NULL
2032 };
2033 
2034 static void DoGBFFReport (
2035   SeqEntryPtr sep,
2036   CleanFlagPtr cfp,
2037   Int2 batch
2038 )
2039 
2040 {
2041 #ifdef OS_UNIX
2042   AsnIoPtr     aip;
2043   Char         arguments [128];
2044   BioseqPtr    bsp;
2045   Char         ch;
2046   Char         cmmd [512];
2047   int          diff;
2048   FileCache    fc;
2049   FILE         *fp;
2050   SeqEntryPtr  fsep;
2051   Char         line [512];
2052   FILE         *ofp;
2053   Char         path1 [PATH_MAX];
2054   Char         path2 [PATH_MAX];
2055   Char         path3 [PATH_MAX];
2056   CharPtr      rep = "reports";
2057   SeqIdPtr     sip;
2058   CharPtr      str;
2059 
2060   if (sep == NULL || cfp == NULL) return;
2061 
2062   fsep = FindNthBioseq (sep, 1);
2063   if (fsep != NULL && fsep->choice == 1) {
2064     bsp = (BioseqPtr) fsep->data.ptrvalue;
2065     if (bsp != NULL) {
2066       for (sip = bsp->id; sip != NULL; sip = sip->next) {
2067         switch (sip->choice) {
2068           case SEQID_GENBANK :
2069             rep = "gbreports";
2070             break;
2071           case SEQID_EMBL :
2072             rep = "ebreports";
2073             break;
2074           case SEQID_DDBJ :
2075             rep = "djreports";
2076             break;
2077           case SEQID_OTHER :
2078             rep = "rfreports";
2079             break;
2080           default :
2081             break;
2082         }
2083       }
2084     }
2085   }
2086 
2087   if (cfp->logfp != NULL) {
2088     fprintf (cfp->logfp, "%s\n", cfp->buf);
2089     fflush (cfp->logfp);
2090   }
2091 
2092   if (batch == 1) {
2093 
2094     TmpNam (path1);
2095     TmpNam (path2);
2096 
2097     fp = FileOpen (path1, "w");
2098     if (fp != NULL) {
2099       SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp);
2100     }
2101     FileClose (fp);
2102     SeriousSeqEntryCleanupBulk (sep);
2103     fp = FileOpen (path2, "w");
2104     if (fp != NULL) {
2105       SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp);
2106     }
2107     FileClose (fp);
2108 
2109     sprintf (cmmd, "%s -o %s -n %s -d %s", cfp->ffdiff, path1, path2, rep);
2110     system (cmmd);
2111 
2112     sprintf (cmmd, "rm %s; rm %s", path1, path2);
2113     system (cmmd);
2114 
2115   } else if (batch == 2) {
2116 
2117     TmpNam (path1);
2118     TmpNam (path2);
2119     TmpNam (path3);
2120 
2121     aip = AsnIoOpen (path3, "w");
2122     if (aip == NULL) return;
2123 
2124     SeqEntryAsnWrite (sep, aip, NULL);
2125     AsnIoClose (aip);
2126 
2127     fp = FileOpen (path1, "w");
2128     if (fp != NULL) {
2129       SeqEntryToGnbk (sep, NULL, GENBANK_FMT, cfp->ffmode, NORMAL_STYLE, 0, 0, 0, NULL, fp);
2130     }
2131     FileClose (fp);
2132 
2133     arguments [0] = '\0';
2134     sprintf (arguments,
2135              "-format genbank -mode %s -style normal -view nuc -nocleanup",
2136              ffmod [(int) cfp->ffmode]);
2137 
2138     sprintf (cmmd, "%s %s -i %s -o %s", cfp->asn2flat, arguments, path3, path2);
2139     system (cmmd);
2140 
2141     sprintf (cmmd, "diff -h %s %s > %s", path1, path2, path3);
2142     diff = system (cmmd);
2143 
2144     if (diff > 0) {
2145       fp = FileOpen (path3, "r");
2146       ofp = FileOpen (rep, "a");
2147       if (fp != NULL && ofp != NULL) {
2148         fprintf (ofp, "\n\n%s\n", cfp->buf);
2149         fflush (ofp);
2150         if (FileCacheSetup (&fc, fp)) {
2151           str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2152           while (str != NULL) {
2153             ch = line [0];
2154             if (ch == '<' || ch == '>' || ch == '-') {
2155               fprintf (ofp, "%s\n", line);
2156             } else if (IS_DIGIT (ch)) {
2157               fprintf (ofp, "\n%s\n", "===");
2158             }
2159             str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2160           }
2161         }
2162       }
2163       FileClose (ofp);
2164       FileClose (fp);
2165     }
2166 
2167     sprintf (cmmd, "rm %s; rm %s; rm %s", path1, path2, path3);
2168     system (cmmd);
2169   }
2170 #endif
2171 }
2172 
2173 static void DoValidatorReport (
2174   SeqEntryPtr sep,
2175   FILE *logfp,
2176   CharPtr id,
2177   CharPtr asnval
2178 )
2179 
2180 {
2181 #ifdef OS_UNIX
2182   AsnIoPtr   aip;
2183   Char       ch;
2184   Char       cmmd [512];
2185   int        diff;
2186   FileCache  fc;
2187   FILE       *fp;
2188   Char       line [512];
2189   Char       path1 [PATH_MAX];
2190   Char       path2 [PATH_MAX];
2191   Char       path3 [PATH_MAX];
2192   Char       path4 [PATH_MAX];
2193   Char       path5 [PATH_MAX];
2194   CharPtr    str;
2195 
2196   if (sep == NULL || logfp == NULL) return;
2197   if (StringHasNoText (id) || StringHasNoText (asnval)) return;
2198 
2199   TmpNam (path1);
2200   TmpNam (path2);
2201   TmpNam (path3);
2202   TmpNam (path4);
2203   TmpNam (path5);
2204 
2205   RemoveAllNcbiCleanupUserObjects (sep);
2206 
2207   aip = AsnIoOpen (path3, "w");
2208   if (aip == NULL) return;
2209 
2210   SeqEntryAsnWrite (sep, aip, NULL);
2211   AsnIoClose (aip);
2212 
2213   SeriousSeqEntryCleanup (sep, NULL, NULL);
2214   RemoveAllNcbiCleanupUserObjects (sep);
2215 
2216   aip = AsnIoOpen (path4, "w");
2217   if (aip == NULL) return;
2218 
2219   SeqEntryAsnWrite (sep, aip, NULL);
2220   AsnIoClose (aip);
2221 
2222   sprintf (cmmd, "%s -i %s -o stdout -Q 1 -r -l | sort > %s", asnval, path3, path1);
2223   system (cmmd);
2224 
2225   sprintf (cmmd, "%s -i %s -o stdout -Q 1 -r -l | sort > %s", asnval, path4, path2);
2226   system (cmmd);
2227 
2228   sprintf (cmmd, "diff -h %s %s > %s", path1, path2, path5);
2229   diff = system (cmmd);
2230 
2231   if (diff > 0) {
2232     fp = FileOpen (path5, "r");
2233     if (fp != NULL) {
2234       fprintf (logfp, "\n\n%s\n", id);
2235       fflush (logfp);
2236       if (FileCacheSetup (&fc, fp)) {
2237         str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2238         while (str != NULL) {
2239           ch = line [0];
2240           if (ch == '<' || ch == '>' || ch == '-') {
2241             fprintf (logfp, "%s\n", line);
2242           } else if (IS_DIGIT (ch)) {
2243             fprintf (logfp, "\n%s\n", "===");
2244           }
2245           str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
2246         }
2247       }
2248       fflush (logfp);
2249     }
2250     FileClose (fp);
2251   }
2252 
2253   sprintf (cmmd, "rm %s; rm %s; rm %s; rm %s; rm %s", path1, path2, path3, path4, path5);
2254   system (cmmd);
2255 #endif
2256 }
2257 
2258 static void DoModernizeReport (
2259   SeqEntryPtr sep,
2260   CleanFlagPtr cfp
2261 )
2262 
2263 {
2264   ByteStorePtr  bs = NULL, tmp = NULL;
2265 
2266   bs = Se2Bs (sep);
2267 
2268   VisitFeaturesInSep (sep, NULL, ModGenes);
2269   tmp = Se2Bs (sep);
2270   if (! BSEqual (bs, tmp)) {
2271     if (cfp->logfp != NULL) {
2272       fprintf (cfp->logfp, "GEN %s\n", cfp->buf);
2273       fflush (cfp->logfp);
2274     }
2275   }
2276   BSFree (bs);
2277   bs = tmp;
2278 
2279   VisitFeaturesInSep (sep, NULL, ModRNAs);
2280   tmp = Se2Bs (sep);
2281   if (! BSEqual (bs, tmp)) {
2282     if (cfp->logfp != NULL) {
2283       fprintf (cfp->logfp, "NCR %s\n", cfp->buf);
2284       fflush (cfp->logfp);
2285     }
2286   }
2287   BSFree (bs);
2288   bs = tmp;
2289 
2290   VisitBioSourcesInSep (sep, NULL, ModPCRs);
2291   tmp = Se2Bs (sep);
2292   if (! BSEqual (bs, tmp)) {
2293     if (cfp->logfp != NULL) {
2294       fprintf (cfp->logfp, "PCR %s\n", cfp->buf);
2295       fflush (cfp->logfp);
2296     }
2297   }
2298   BSFree (bs);
2299   bs = tmp;
2300 
2301   BSFree (bs);
2302 }
2303 
2304 static CharPtr stopWords [] = {
2305   "a",
2306   "about",
2307   "again",
2308   "all",
2309   "almost",
2310   "also",
2311   "although",
2312   "always",
2313   "among",
2314   "an",
2315   "and",
2316   "another",
2317   "any",
2318   "are",
2319   "as",
2320   "at",
2321   "be",
2322   "because",
2323   "been",
2324   "before",
2325   "being",
2326   "between",
2327   "both",
2328   "but",
2329   "by",
2330   "can",
2331   "could",
2332   "did",
2333   "do",
2334   "does",
2335   "done",
2336   "due",
2337   "during",
2338   "each",
2339   "either",
2340   "enough",
2341   "especially",
2342   "etc",
2343   "for",
2344   "found",
2345   "from",
2346   "further",
2347   "had",
2348   "has",
2349   "have",
2350   "having",
2351   "here",
2352   "how",
2353   "however",
2354   "i",
2355   "if",
2356   "in",
2357   "into",
2358   "is",
2359   "it",
2360   "its",
2361   "itself",
2362   "just",
2363   "kg",
2364   "km",
2365   "made",
2366   "mainly",
2367   "make",
2368   "may",
2369   "mg",
2370   "might",
2371   "ml",
2372   "mm",
2373   "most",
2374   "mostly",
2375   "must",
2376   "nearly",
2377   "neither",
2378   "no",
2379   "nor",
2380   "obtained",
2381   "of",
2382   "often",
2383   "on",
2384   "our",
2385   "overall",
2386   "perhaps",
2387   "pmid",
2388   "quite",
2389   "rather",
2390   "really",
2391   "regarding",
2392   "seem",
2393   "seen",
2394   "several",
2395   "should",
2396   "show",
2397   "showed",
2398   "shown",
2399   "shows",
2400   "significantly",
2401   "since",
2402   "so",
2403   "some",
2404   "such",
2405   "than",
2406   "that",
2407   "the",
2408   "their",
2409   "theirs",
2410   "them",
2411   "then",
2412   "there",
2413   "therefore",
2414   "these",
2415   "they",
2416   "this",
2417   "those",
2418   "through",
2419   "thus",
2420   "to",
2421   "upon",
2422   "use",
2423   "used",
2424   "using",
2425   "various",
2426   "very",
2427   "was",
2428   "we",
2429   "were",
2430   "what",
2431   "when",
2432   "which",
2433   "while",
2434   "with",
2435   "within",
2436   "without",
2437   "would",
2438   NULL
2439 };
2440 
2441 static Boolean IsStopWord (
2442   CharPtr str
2443 )
2444 
2445 {
2446   Int2  i;
2447 
2448   if (StringHasNoText (str)) return FALSE;
2449 
2450   for (i = 0; stopWords [i] != NULL; i++) {
2451     if (StringICmp (str, stopWords [i]) == 0) return TRUE;
2452   }
2453 
2454   return FALSE;
2455 }
2456 
2457 static ValNodePtr GetAuthorMLNameList (
2458   AuthListPtr alp
2459 )
2460 
2461 {
2462   AuthorPtr    ap;
2463   Char         buf [128];
2464   Char         ch;
2465   Char         chr [4];
2466   ValNodePtr   head = NULL;
2467   Char         initials [32];
2468   ValNodePtr   last = NULL;
2469   NameStdPtr   nsp;
2470   PersonIdPtr  pid;
2471   CharPtr      ptr;
2472   CharPtr      str;
2473   ValNodePtr   tmp;
2474   ValNodePtr   vnp;
2475 
2476   if (alp == NULL) return NULL;
2477 
2478   for (vnp = alp->names; vnp != NULL; vnp = vnp->next) {
2479     buf [0] = '\0';
2480     initials [0] = '\0';
2481     switch (alp->choice) {
2482       case 1 :
2483         ap = (AuthorPtr) vnp->data.ptrvalue;
2484         if (ap == NULL) continue;
2485         pid = ap->name;
2486         if (pid == NULL) continue;
2487         if (pid->choice == 2) {
2488           nsp = pid->data;
2489           if (nsp == NULL) continue;
2490           str = nsp->names [0];
2491           if (StringHasNoText (str)) continue;
2492           StringNCpy_0 (buf, str, sizeof (buf));
2493           StringNCpy_0 (initials, nsp->names [4], sizeof (initials));
2494         }
2495         break;
2496       case 2 :
2497       case 3 :
2498         str = (CharPtr) vnp->data.ptrvalue;
2499         if (StringHasNoText (str)) continue;
2500         StringNCpy_0 (buf, str, sizeof (buf));
2501         ptr = StringChr (buf, ',');
2502         if (ptr == NULL) {
2503           ptr = StringChr (buf, ' ');
2504         }
2505         if (ptr != NULL) {
2506           *ptr = '\0';
2507           ptr++;
2508           StringNCpy_0 (initials, ptr, sizeof (initials));
2509         }
2510         break;
2511       default :
2512         break;
2513     }
2514     if (StringHasNoText (buf)) continue;
2515     if (StringDoesHaveText (initials)) {
2516       StringCat (buf, " ");
2517       chr [1] = '\0';
2518       ptr = initials;
2519       ch = *ptr;
2520       while (ch != '\0') {
2521         if (ch != ' ' && ch != '.' && ch != ',') {
2522           chr [0] = ch;
2523           StringCat (buf, chr);
2524         }
2525         ptr++;
2526         ch = *ptr;
2527       }
2528     }
2529     TrimSpacesAroundString (buf);
2530     tmp = ValNodeCopyStr (&last, 0, buf);
2531     if (head == NULL) {
2532       head = tmp;
2533     }
2534     last = tmp;
2535   }
2536 
2537   return head;
2538 }
2539 
2540 static ValNodePtr GetTitleWords (
2541   CharPtr title
2542 )
2543 
2544 {
2545   Char        ch;
2546   Boolean     goOn = TRUE;
2547   ValNodePtr  head = NULL;
2548   ValNodePtr  last = NULL;
2549   CharPtr     ptr;
2550   CharPtr     str;
2551   CharPtr     tmp;
2552   ValNodePtr  vnp;
2553 
2554   if (StringHasNoText (title)) return NULL;
2555 
2556   tmp = StringSave (title);
2557   if (tmp == NULL) return NULL;
2558 
2559   ptr = tmp;
2560   ch = *ptr;
2561   if (ch == '\0') {
2562     goOn = FALSE;
2563   }
2564   while (goOn) {
2565     while (ch != '\0' && (! IS_ALPHANUM (ch))) {
2566       ptr++;
2567       ch = *ptr;
2568     }
2569     str = ptr;
2570     while (ch != '\0' && IS_ALPHANUM (ch)) {
2571       ptr++;
2572       ch = *ptr;
2573     }
2574     if (ch == '\0') {
2575       goOn = FALSE;
2576     }
2577     *ptr = '\0';
2578     ptr++;
2579     ch = *ptr;
2580     TrimSpacesAroundString (str);
2581     /*
2582     if (! IsStopWord (str)) {
2583       vnp = ValNodeCopyStr (&last, 0, str);
2584       if (head == NULL) {
2585         head = vnp;
2586       }
2587       last = vnp;
2588     }
2589     */
2590     vnp = ValNodeCopyStr (&last, 0, str);
2591     if (head == NULL) {
2592       head = vnp;
2593     }
2594     last = vnp;
2595   }
2596 
2597   MemFree (tmp);
2598 
2599   return head;
2600 }
2601 
2602 static ValNodePtr DuplicateStringList (
2603   ValNodePtr list
2604 )
2605 
2606 {
2607   ValNodePtr  head = NULL;
2608   ValNodePtr  last = NULL;
2609   CharPtr     str;
2610   ValNodePtr  tmp;
2611   ValNodePtr  vnp;
2612 
2613   if (list == NULL) return NULL;
2614 
2615   for (vnp = list; vnp != NULL; vnp = vnp->next) {
2616     str = (CharPtr) vnp->data.ptrvalue;
2617     if (StringHasNoText (str)) continue;
2618     tmp = ValNodeCopyStr (&last, 0, str);
2619     if (head == NULL) {
2620       head = tmp;
2621     }
2622     last = tmp;
2623   }
2624 
2625   return head;
2626 }
2627 
/* How much of the initials after the last name TrimInitials keeps */
typedef enum {
  FULL_INITIALS = 0,  /* leave the name untouched */
  TWO_INITIALS  = 1,  /* keep at most two initials */
  ONE_INITIAL   = 2,  /* keep at most one initial */
  NO_INITIALS   = 3   /* keep the last name only */
} InitialsPolicy;
2634 
2635 static void TrimInitials (
2636   CharPtr auth,
2637   InitialsPolicy initials
2638 )
2639 
2640 {
2641   Char     ch;
2642   CharPtr  ptr;
2643 
2644   if (StringHasNoText (auth)) return;
2645 
2646   switch (initials) {
2647     case FULL_INITIALS :
2648       break;
2649     case TWO_INITIALS :
2650       ptr = StringRChr (auth, ' ');
2651       if (ptr != NULL) {
2652         ptr++;
2653         ch = *ptr;
2654         if (IS_ALPHANUM (ch)) {
2655           ptr++;
2656           ch = *ptr;
2657           if (IS_ALPHANUM (ch)) {
2658             ptr++;
2659             *ptr = '\0';
2660           }
2661         }
2662       }
2663       break;
2664     case ONE_INITIAL :
2665       ptr = StringRChr (auth, ' ');
2666       if (ptr != NULL) {
2667         ptr++;
2668         ch = *ptr;
2669         if (IS_ALPHANUM (ch)) {
2670           ptr++;
2671           *ptr = '\0';
2672         }
2673       }
2674       break;
2675     case NO_INITIALS :
2676       ptr = StringRChr (auth, ' ');
2677       if (ptr != NULL) {
2678         *ptr = '\0';
2679       }
2680       break;
2681     default :
2682       break;
2683   }
2684 }
2685 
2686 static Int4 DoUnpubBooleanQuery (
2687   ValNodePtr authors,
2688   InitialsPolicy initials,
2689   Boolean firstLastOnly,
2690   ValNodePtr titlewords,
2691   Int2 year,
2692   Boolean expand,
2693   Uint4Ptr uidp
2694 )
2695 
2696 {
2697   Boolean                 addOpAnd = FALSE;
2698   Char                    buf [128];
2699   Int4                    count = 0;
2700   Entrez2BooleanReplyPtr  e2br;
2701   Entrez2IdListPtr        e2id;
2702   Entrez2RequestPtr       e2rp = NULL;
2703   Entrez2ReplyPtr         e2ry;
2704   CharPtr                 str;
2705   ValNodePtr              vnp;
2706 
2707   if (uidp != NULL) {
2708     *uidp = 0;
2709   }
2710 
2711   e2rp = EntrezCreateBooleanRequest (TRUE, FALSE, "PubMed", NULL, 0, 0, NULL, 20, 0);
2712   if (e2rp == NULL) return 0;
2713 
2714   for (vnp = authors; vnp != NULL; vnp = vnp->next) {
2715     str = (CharPtr) vnp->data.ptrvalue;
2716     if (StringHasNoText (str)) continue;
2717     if (firstLastOnly) {
2718       if (vnp != authors && vnp->next != NULL) continue;
2719     }
2720     StringNCpy_0 (buf, str, sizeof (buf));
2721     switch (initials) {
2722       case TWO_INITIALS :
2723         TrimInitials (buf, TWO_INITIALS);
2724         break;
2725       case ONE_INITIAL :
2726         TrimInitials (buf, ONE_INITIAL);
2727         break;
2728       case NO_INITIALS :
2729         TrimInitials (buf, NO_INITIALS);
2730         break;
2731       default :
2732         break;
2733     }
2734     if (addOpAnd) {
2735       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2736     }
2737     EntrezAddToBooleanRequest (e2rp, NULL, 0, "AUTH", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2738     addOpAnd = TRUE;
2739   }
2740 
2741   for (vnp = titlewords; vnp != NULL; vnp = vnp->next) {
2742     str = (CharPtr) vnp->data.ptrvalue;
2743     if (StringHasNoText (str)) continue;
2744     if (IsStopWord (str)) continue;
2745     StringNCpy_0 (buf, str, sizeof (buf));
2746     if (addOpAnd) {
2747       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2748     }
2749     EntrezAddToBooleanRequest (e2rp, NULL, 0, "TITL", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2750     addOpAnd = TRUE;
2751   }
2752 
2753   if (year > 0) {
2754     if (addOpAnd) {
2755       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2756     }
2757     if (expand) {
2758       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2759       sprintf (buf, "%d", (int) year - 1);
2760       EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2761       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2762       sprintf (buf, "%d", (int) year);
2763       EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2764       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2765       sprintf (buf, "%d", (int) year + 1);
2766       EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2767       EntrezAddToBooleanRequest (e2rp, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2768     } else {
2769       sprintf (buf, "%d", (int) year);
2770       EntrezAddToBooleanRequest (e2rp, NULL, 0, "EDAT", buf, NULL, 0, 0, NULL, NULL, FALSE, FALSE);
2771     }
2772     addOpAnd = TRUE;
2773   }
2774 
2775   e2ry = EntrezSynchronousQuery (e2rp);
2776 
2777   e2rp = Entrez2RequestFree (e2rp);
2778   if (e2ry == NULL) return 0;
2779   e2br = EntrezExtractBooleanReply (e2ry);
2780   if (e2br == NULL) return 0;
2781 
2782   count = e2br->count;
2783 
2784   if (count > 0 && uidp != NULL) {
2785     e2id = e2br->uids;
2786     if (e2id != NULL && e2id->num == 1 && e2id->uids != NULL) {
2787       BSSeek (e2id->uids, 0, SEEK_SET);
2788       *uidp = Nlm_BSGetUint4 (e2id->uids);
2789     }
2790   }
2791 
2792   Entrez2BooleanReplyFree (e2br);
2793 
2794   return count;
2795 }
2796 
2797 static CharPtr GetBestJournal (
2798   ValNodePtr journaltitle
2799 )
2800 
2801 {
2802   CharPtr     str;
2803   ValNodePtr  vnp;
2804 
2805   if (journaltitle == NULL) return NULL;
2806 
2807   for (vnp = journaltitle; vnp != NULL; vnp = vnp->next) {
2808     if (vnp->choice == Cit_title_iso_jta) {
2809       str = (CharPtr) vnp->data.ptrvalue;
2810       if (StringHasNoText (str)) continue;
2811       return str;
2812     }
2813   }
2814 
2815   for (vnp = journaltitle; vnp != NULL; vnp = vnp->next) {
2816     if (vnp->choice == Cit_title_name || vnp->choice == Cit_title_jta) {
2817       str = (CharPtr) vnp->data.ptrvalue;
2818       if (StringHasNoText (str)) continue;
2819       return str;
2820     }
2821   }
2822 
2823   return NULL;
2824 }
2825 
2826 typedef struct pubref {
2827   ValNodePtr  authors;
2828   ValNodePtr  titlewords;
2829   CharPtr     fulltitle;
2830   CharPtr     uniquestr;
2831   CharPtr     journal;
2832   ImprintPtr  imp;
2833   Int2        year;
2834   Uint4       pmid;
2835 } PubRef, PNTR PubRefPtr;
2836 
2837 static void PrintPubAuthors (
2838   CleanFlagPtr cfp,
2839   PubRefPtr prp
2840 )
2841 
2842 {
2843   CharPtr     prefix = "";
2844   CharPtr     str;
2845   ValNodePtr  vnp;
2846 
2847   if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2848 
2849   for (vnp = prp->authors; vnp != NULL; vnp = vnp->next) {
2850     str = (CharPtr) vnp->data.ptrvalue;
2851     if (StringHasNoText (str)) continue;
2852     fprintf (cfp->logfp, "%s%s", prefix, str);
2853     prefix = ", ";
2854   }
2855 }
2856 
2857 static void PrintPubTitle (
2858   CleanFlagPtr cfp,
2859   PubRefPtr prp
2860 )
2861 
2862 {
2863   CharPtr     prefix = "";
2864   CharPtr     str;
2865   ValNodePtr  vnp;
2866 
2867   if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2868 
2869   if (StringDoesHaveText (prp->fulltitle)) {
2870     fprintf (cfp->logfp, "%s%s", prefix, prp->fulltitle);
2871   } else {
2872     for (vnp = prp->titlewords; vnp != NULL; vnp = vnp->next) {
2873       str = (CharPtr) vnp->data.ptrvalue;
2874       if (StringHasNoText (str)) continue;
2875       fprintf (cfp->logfp, "%s%s", prefix, str);
2876       prefix = " ";
2877     }
2878   }
2879 }
2880 
2881 static void PrintPubJournal (
2882   CleanFlagPtr cfp,
2883   PubRefPtr prp
2884 )
2885 
2886 {
2887   DatePtr     dp = NULL;
2888   ImprintPtr  imp;
2889   CharPtr     prefix = "";
2890   Int2        year;
2891 
2892   if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2893 
2894   if (StringHasNoText (prp->journal) && prp->imp == NULL) {
2895     fprintf (cfp->logfp, "Unpublished");
2896     prefix = " ";
2897     if (prp->year > 0) {
2898       fprintf (cfp->logfp, "%s[%d]", prefix, (int) prp->year);
2899       prefix = " ";
2900     }
2901     return;
2902   }
2903 
2904   if (StringDoesHaveText (prp->journal)) {
2905     fprintf (cfp->logfp, "%s%s", prefix, prp->journal);
2906     prefix = " ";
2907   }
2908 
2909   imp = prp->imp;
2910   if (imp != NULL) {
2911     dp = imp->date;
2912     if (dp != NULL && dp->data [0] == 1) {
2913       year = (Int2) dp->data [1] + 1900;
2914       fprintf (cfp->logfp, "%s[%d]", prefix, (int) year);
2915       prefix = " ";
2916     }
2917     if (StringDoesHaveText (imp->volume)) {
2918       fprintf (cfp->logfp, "%s%s", prefix, imp->volume);
2919       prefix = " ";
2920     }
2921     /*
2922     if (StringDoesHaveText (imp->issue)) {
2923       fprintf (cfp->logfp, "%s(%s)", prefix, imp->issue);
2924       prefix = " ";
2925     }
2926     */
2927     if (StringDoesHaveText (imp->pages)) {
2928       fprintf (cfp->logfp, "%s: %s", prefix, imp->pages);
2929       prefix = " ";
2930     }
2931   }
2932 
2933   if (prp->pmid > 0) {
2934     fprintf (cfp->logfp, "%s<%ld>", prefix, (long) prp->pmid);
2935     prefix = " ";
2936   }
2937 }
2938 
2939 /*
2940 static void PrintAuthTitle (
2941   CleanFlagPtr cfp,
2942   CharPtr label,
2943   Boolean multiline,
2944   PubRefPtr prp
2945 )
2946 
2947 {
2948   DatePtr     dp = NULL;
2949   ImprintPtr  imp;
2950   CharPtr     prefix;
2951   CharPtr     separator = " ";
2952   CharPtr     str;
2953   ValNodePtr  vnp;
2954   Int2        year;
2955 
2956   if (cfp == NULL || cfp->logfp == NULL || prp == NULL) return;
2957 
2958   if (multiline) {
2959     separator = "\n";
2960   }
2961 
2962   if (StringDoesHaveText (label)) {
2963     fprintf (cfp->logfp, "%s", label);
2964     fprintf (cfp->logfp, separator);
2965   }
2966 
2967   prefix = "Author:  ";
2968   for (vnp = prp->authors; vnp != NULL; vnp = vnp->next) {
2969     str = (CharPtr) vnp->data.ptrvalue;
2970     if (StringHasNoText (str)) continue;
2971     fprintf (cfp->logfp, "%s%s", prefix, str);
2972     prefix = ", ";
2973   }
2974   fprintf (cfp->logfp, separator);
2975   prefix = "Title:   ";
2976   if (StringDoesHaveText (prp->fulltitle)) {
2977     fprintf (cfp->logfp, "%s%s", prefix, prp->fulltitle);
2978   } else {
2979     for (vnp = prp->titlewords; vnp != NULL; vnp = vnp->next) {
2980       str = (CharPtr) vnp->data.ptrvalue;
2981       if (StringHasNoText (str)) continue;
2982       fprintf (cfp->logfp, "%s%s", prefix, str);
2983       prefix = " ";
2984     }
2985   }
2986   fprintf (cfp->logfp, separator);
2987   year = prp->year;
2988   if (year > 0) {
2989     fprintf (cfp->logfp, "Year:    %d", (int) year);
2990     fprintf (cfp->logfp, separator);
2991   }
2992 
2993   if (StringDoesHaveText (prp->journal)) {
2994     fprintf (cfp->logfp, "Journal: %s", prp->journal);
2995     imp = prp->imp;
2996     if (imp != NULL) {
2997       dp = imp->date;
2998       if (dp != NULL && dp->data [0] == 1) {
2999         year = (Int2) dp->data [1] + 1900;
3000         fprintf (cfp->logfp, ". [%d]", (int) year);
3001       }
3002       if (StringDoesHaveText (imp->volume)) {
3003         fprintf (cfp->logfp, ". %s", imp->volume);
3004       }
3005       if (StringDoesHaveText (imp->issue)) {
3006         fprintf (cfp->logfp, " (%s)", imp->issue);
3007       }
3008       if (StringDoesHaveText (imp->pages)) {
3009         fprintf (cfp->logfp, " : %s", imp->pages);
3010       }
3011     }
3012     fprintf (cfp->logfp, separator);
3013   }
3014 
3015   fprintf (cfp->logfp, "\n");
3016   fflush (cfp->logfp);
3017 }
3018 */
3019 
3020 typedef enum {
3021   NO_NAME_MATCH,
3022   LAST_NAME_MATCH,
3023   ONE_INIT_MATCH,
3024   TWO_INIT_MATCH,
3025   FULL_NAME_MATCH
3026 } AuthComp;
3027 
3028 static CharPtr authlabel [] = {
3029   "AUTH_MISMATCH", "LAST_NAMES", "ONE_INIT", "TWO_INITS", "FULL_NAMES"
3030 };
3031 
3032 static AuthComp AuthorCompare (
3033   CharPtr auth1,
3034   CharPtr auth2
3035 )
3036 
3037 {
3038   Char  buf1 [128];
3039   Char  buf2 [128];
3040 
3041   if (StringHasNoText (auth1) || StringHasNoText (auth2)) return NO_NAME_MATCH;
3042 
3043   StringNCpy_0 (buf1, auth1, sizeof (buf1));
3044   StringNCpy_0 (buf2, auth2, sizeof (buf2));
3045 
3046   if (StringICmp (buf1, buf2) == 0) return FULL_NAME_MATCH;
3047 
3048   TrimInitials (buf1, TWO_INITIALS);
3049   TrimInitials (buf2, TWO_INITIALS);
3050 
3051   if (StringICmp (buf1, buf2) == 0) return TWO_INIT_MATCH;
3052 
3053   TrimInitials (buf1, ONE_INITIAL);
3054   TrimInitials (buf2, ONE_INITIAL);
3055 
3056   if (StringICmp (buf1, buf2) == 0) return ONE_INIT_MATCH;
3057 
3058   TrimInitials (buf1, NO_INITIALS);
3059   TrimInitials (buf2, NO_INITIALS);
3060 
3061   if (StringICmp (buf1, buf2) == 0) return LAST_NAME_MATCH;
3062 
3063   return NO_NAME_MATCH;
3064 }
3065 
3066 static AuthComp AuthorsIdentical (
3067   ValNodePtr oldauthors,
3068   ValNodePtr newauthors
3069 )
3070 
3071 {
3072   AuthComp    curr, rsult = FULL_NAME_MATCH;
3073   CharPtr     str1, str2;
3074   ValNodePtr  vnp1, vnp2;
3075 
3076   if (oldauthors == NULL || newauthors == NULL) return NO_NAME_MATCH;
3077 
3078   for (vnp1 = oldauthors, vnp2 = newauthors;
3079        vnp1 != NULL && vnp2 != NULL;
3080        vnp1 = vnp1->next, vnp2 = vnp2->next) {
3081     str1 = (CharPtr) vnp1->data.ptrvalue;
3082     str2 = (CharPtr) vnp2->data.ptrvalue;
3083     curr = AuthorCompare (str1, str2);
3084     if (curr == NO_NAME_MATCH) return NO_NAME_MATCH;
3085     if (curr < rsult) {
3086       rsult = curr;
3087     }
3088   }
3089 
3090   if (vnp1 != NULL || vnp2 != NULL) return NO_NAME_MATCH;
3091 
3092   return rsult;
3093 }
3094 
3095 static AuthComp AuthorInList (
3096   CharPtr author,
3097   ValNodePtr newauthors
3098 )
3099 
3100 {
3101   AuthComp    curr, rsult = FULL_NAME_MATCH;
3102   CharPtr     str;
3103   ValNodePtr  vnp;
3104 
3105   if (StringHasNoText (author) || newauthors == NULL) return NO_NAME_MATCH;
3106 
3107   for (vnp = newauthors; vnp != NULL; vnp = vnp->next) {
3108     str = (CharPtr) vnp->data.ptrvalue;
3109     curr = AuthorCompare (author, str);
3110     if (curr == NO_NAME_MATCH) continue;
3111     if (curr < rsult) {
3112       rsult = curr;
3113     }
3114   }
3115 
3116   return rsult;
3117 }
3118 
3119 static Boolean WordInList (
3120   CharPtr word,
3121   ValNodePtr newtitlewords
3122 )
3123 
3124 {
3125   CharPtr     str;
3126   ValNodePtr  vnp;
3127 
3128   if (StringHasNoText (word) || newtitlewords == NULL) return NO_NAME_MATCH;
3129 
3130   for (vnp = newtitlewords; vnp != NULL; vnp = vnp->next) {
3131     str = (CharPtr) vnp->data.ptrvalue;
3132     if (StringHasNoText (str)) continue;
3133     if (StringICmp (word, str) == 0) return TRUE;
3134   }
3135 
3136   return FALSE;
3137 }
3138 
3139 static void PrintComparison (
3140   CleanFlagPtr cfp,
3141   PubRefPtr oldprp,
3142   PubRefPtr newprp
3143 )
3144 
3145 {
3146   AuthComp    authcomp, curr, best = FULL_NAME_MATCH;
3147   Int2        matches, total;
3148   CharPtr     str, str1, str2;
3149   Boolean     titlsame;
3150   ValNodePtr  vnp;
3151 
3152   if (cfp == NULL || cfp->logfp == NULL || oldprp == NULL || newprp == NULL) return;
3153 
3154   authcomp = AuthorsIdentical (oldprp->authors, newprp->authors);
3155   titlsame = (Boolean) (StringICmp (oldprp->fulltitle, newprp->fulltitle) == 0);
3156 
3157   fprintf (cfp->logfp, "PMID %ld", (long) newprp->pmid);
3158   fprintf (cfp->logfp, "\t");
3159 
3160   fprintf (cfp->logfp, "%s", cfp->buf);
3161   fprintf (cfp->logfp, "\t");
3162 
3163   fprintf (cfp->logfp, "REF_COUNT %ld", (long) cfp->unpubcount);
3164   fprintf (cfp->logfp, "\t");
3165 
3166   fprintf (cfp->logfp, "ORIG_NAMES %ld", (long) ValNodeLen (oldprp->authors));
3167   fprintf (cfp->logfp, "\t");
3168 
3169   fprintf (cfp->logfp, "ADDL_NAMES %ld", (long) (ValNodeLen (newprp->authors) - ValNodeLen (oldprp->authors)));
3170   fprintf (cfp->logfp, "\t");
3171 
3172   fprintf (cfp->logfp, "ORIG_WORDS %ld", (long) ValNodeLen (oldprp->titlewords));
3173   fprintf (cfp->logfp, "\t");
3174 
3175   fprintf (cfp->logfp, "ADDL_WORDS %ld", (long) (ValNodeLen (newprp->titlewords) - ValNodeLen (oldprp->titlewords)));
3176   fprintf (cfp->logfp, "\t");
3177 
3178   if (StringDoesHaveText (oldprp->uniquestr)) {
3179     fprintf (cfp->logfp, "UNIQ_CIT %s", oldprp->uniquestr);
3180   } else {
3181     fprintf (cfp->logfp, "?");
3182   }
3183   fprintf (cfp->logfp, "\t");
3184 
3185   if (authcomp != NO_NAME_MATCH) {
3186     str = authlabel [(int) authcomp];
3187     fprintf (cfp->logfp, "AUTHORS_SAME [%s]", str);
3188     fprintf (cfp->logfp, "\t");
3189   } else {
3190     total = ValNodeLen (newprp->authors);
3191 
3192     matches = 0;
3193     for (vnp = oldprp->authors; vnp != NULL; vnp = vnp->next) {
3194       str = (CharPtr) vnp->data.ptrvalue;
3195       if (StringHasNoText (str)) continue;
3196       curr = AuthorInList (str, newprp->authors);
3197       if (curr == NO_NAME_MATCH) continue;
3198       matches++;
3199       if (curr < best) {
3200         best = curr;
3201       }
3202     }
3203 
3204     str = authlabel [(int) best];
3205     fprintf (cfp->logfp, "AUTHORS_DIFFER [%s] %d / %d", str, (int) matches, (int) total);
3206     fprintf (cfp->logfp, "\t");
3207   }
3208 
3209   if (titlsame) {
3210     fprintf (cfp->logfp, "TITLE_SAME");
3211   } else {
3212     total = 0;
3213     for (vnp = newprp->titlewords; vnp != NULL; vnp = vnp->next) {
3214       str = (CharPtr) vnp->data.ptrvalue;
3215       if (StringHasNoText (str)) continue;
3216       if (IsStopWord (str)) continue;
3217       total++;
3218     }
3219 
3220     matches = 0;
3221     for (vnp = oldprp->titlewords; vnp != NULL; vnp = vnp->next) {
3222       str = (CharPtr) vnp->data.ptrvalue;
3223       if (StringHasNoText (str)) continue;
3224       if (IsStopWord (str)) continue;
3225       if (! WordInList (str, newprp->titlewords)) continue;
3226       matches++;
3227     }
3228 
3229     str1 = NULL;
3230     str2 = NULL;
3231     vnp = oldprp->titlewords;
3232     if (vnp != NULL) {
3233       str1 = (CharPtr) vnp->data.ptrvalue;
3234     }
3235     vnp = newprp->titlewords;
3236     if (vnp != NULL) {
3237       str2 = (CharPtr) vnp->data.ptrvalue;
3238     }
3239     if (str1 != NULL && str2 != NULL && StringCmp (str1, str2) == 0 && total > 0 && matches == total) {
3240       fprintf (cfp->logfp, "TITLE_SIMILAR %d / %d", (int) matches, (int) total);
3241     } else if (total > 0 && matches == total) {
3242       fprintf (cfp->logfp, "TITLE_SUSPECT %d / %d", (int) matches, (int) total);
3243     } else {
3244       fprintf (cfp->logfp, "TITLE_DIFFERS %d / %d", (int) matches, (int) total);
3245     }
3246   }
3247   fprintf (cfp->logfp, "\t");
3248 
3249   PrintPubAuthors (cfp, oldprp);
3250   fprintf (cfp->logfp, "\t");
3251 
3252   PrintPubAuthors (cfp, newprp);
3253   fprintf (cfp->logfp, "\t");
3254 
3255   PrintPubTitle (cfp, oldprp);
3256   fprintf (cfp->logfp, "\t");
3257 
3258   PrintPubTitle (cfp, newprp);
3259   fprintf (cfp->logfp, "\t");
3260 
3261   PrintPubJournal (cfp, oldprp);
3262   fprintf (cfp->logfp, "\t");
3263 
3264   PrintPubJournal (cfp, newprp);
3265 
3266   fprintf (cfp->logfp, "\n");
3267 
3268   /*
3269   if (identical) {
3270     PrintAuthTitle (cfp, "EXACT:  ", FALSE, newprp);
3271   } else {
3272     PrintAuthTitle (cfp, "BEFORE: ", FALSE, oldprp);
3273     PrintAuthTitle (cfp, "AFTER:  ", FALSE, newprp);
3274   }
3275   */
3276 }
3277 
3278 static void StrStripSpaces (
3279   CharPtr str
3280 )
3281 
3282 {
3283   CharPtr  new_str;
3284 
3285   if (str == NULL) return;
3286 
3287   new_str = str;
3288   while (*str != '\0') {
3289     *new_str++ = *str;
3290     if (*str == ' ' || *str == '\t' || *str == '(') {
3291       for (str++; *str == ' ' || *str == '\t'; str++) continue;
3292       if (*str == ')' || *str == ',') {
3293         new_str--;
3294       }
3295     } else {
3296       str++;
3297     }
3298   }
3299   *new_str = '\0';
3300 }
3301 
3302 static void StrStripBrackets (
3303   CharPtr str
3304 )
3305 
3306 {
3307   size_t  len;
3308 
3309   if (str == NULL) return;
3310 
3311   len = StringLen (str);
3312   if (len < 2) return;
3313 
3314   if (str [0] == '[') {
3315     str [0] = ' ';
3316   }
3317 
3318   if (str [len - 1] == ']') {
3319     str [len - 1] = ' ';
3320   }
3321 }
3322 
3323 static void PrintPubMedCit (
3324   CleanFlagPtr cfp,
3325   Uint4 pmid,
3326   PubRefPtr oldprp
3327 )
3328 
3329 {
3330   CitArtPtr        cap;
3331   CitJourPtr       cjp;
3332   DatePtr          dp;
3333   ImprintPtr       imp;
3334   MedlineEntryPtr  mep;
3335   PubmedEntryPtr   pep;
3336   PubRef           pr;
3337   CharPtr          str;
3338   CharPtr          tmp;
3339   ValNodePtr       vnp;
3340 
3341   if (cfp == NULL || cfp->logfp == NULL || pmid < 1) return;
3342 
3343   MemSet ((Pointer) &pr, 0, sizeof (PubRef));
3344 
3345   pep = PubMedSynchronousQuery (pmid);
3346   if (pep == NULL) return;
3347 
3348   mep = (MedlineEntryPtr) pep->medent;
3349   if (mep != NULL && mep->cit != NULL) {
3350     cap = mep->cit;
3351     if (cap != NULL) {
3352       pr.authors = GetAuthorMLNameList (cap->authors);
3353       for (vnp = cap->title; vnp != NULL; vnp = vnp->next) {
3354         if (vnp->choice == Cit_title_name) {
3355           str = (CharPtr) vnp->data.ptrvalue;
3356           if (StringHasNoText (str)) continue;
3357           pr.titlewords = GetTitleWords (str);
3358           tmp = StringSave (str);
3359           TrimSpacesAndJunkFromEnds (tmp, TRUE);
3360           s_RemovePeriodFromEnd (tmp);
3361           StrStripBrackets (tmp);
3362           StrStripSpaces (tmp);
3363           pr.fulltitle = tmp;
3364         }
3365       }
3366       if (cap->from == 1) {
3367         cjp = (CitJourPtr) cap->fromptr;
3368         if (cjp != NULL) {
3369           pr.journal = GetBestJournal (cjp->title);
3370           imp = cjp->imp;
3371           pr.imp = imp;
3372           if (imp != NULL) {
3373             dp = imp->date;
3374             if (dp != NULL && dp->data [0] == 1) {
3375               pr.year = (Int2) dp->data [1] + 1900;
3376             }
3377           }
3378         }
3379       }
3380       pr.pmid = pmid;
3381       if (pr.authors != NULL && pr.titlewords != NULL) {
3382         /*
3383         PrintAuthTitle (cfp, "RESULT", TRUE, &pr);
3384         */
3385         PrintComparison (cfp, oldprp, &pr);
3386       }
3387       ValNodeFreeData (pr.authors);
3388       ValNodeFreeData (pr.titlewords);
3389       MemFree (pr.fulltitle);
3390     }
3391   }
3392 
3393   pep = PubmedEntryFree (pep);
3394 }
3395 
3396 static void TryEntrezQueries (
3397   CleanFlagPtr cfp,
3398   PubRefPtr prp
3399 )
3400 
3401 {
3402   Int4   count;
3403   Uint4  pmid = 0;
3404 
3405   if (cfp == NULL || cfp->logfp == NULL || prp == NULL || prp->authors == NULL) return;
3406 
3407   count = DoUnpubBooleanQuery (prp->authors, ONE_INITIAL, FALSE, prp->titlewords, prp->year, TRUE, &pmid);
3408 
3409   if (count > 1) {
3410     count = DoUnpubBooleanQuery (prp->authors, TWO_INITIALS, FALSE, prp->titlewords, prp->year, TRUE, &pmid);
3411   }
3412 
3413   if (count < 1) {
3414 
3415     fprintf (cfp->logfp, "0\t%s\tUNPUB\t", cfp->buf);
3416     PrintPubAuthors (cfp, prp);
3417     fprintf (cfp->logfp, "\t");
3418     PrintPubTitle (cfp, prp);
3419     fprintf (cfp->logfp, "\t");
3420     PrintPubJournal (cfp, prp);
3421     fprintf (cfp->logfp, "\n");
3422 
3423   } else if (count > 1) {
3424 
3425     fprintf (cfp->logfp, "0\t%s\tCOUNT %ld\t", cfp->buf, (long) count);
3426     PrintPubAuthors (cfp, prp);
3427     fprintf (cfp->logfp, "\t");
3428     PrintPubTitle (cfp, prp);
3429     fprintf (cfp->logfp, "\t");
3430     PrintPubJournal (cfp, prp);
3431     fprintf (cfp->logfp, "\n");
3432 
3433   } else {
3434 
3435     PrintPubMedCit (cfp, pmid, prp);
3436   }
3437 
3438   fflush (cfp->logfp);
3439 }
3440 
3441 static void CountUnpubPub (
3442   PubdescPtr pdp,
3443   Pointer userdata
3444 )
3445 
3446 {
3447   CleanFlagPtr  cfp;
3448   CitGenPtr     cgp = NULL;
3449   Boolean       hasUnpublished = FALSE;
3450   ValNodePtr    vnp;
3451 
3452   if (pdp == NULL) return;
3453   cfp = (CleanFlagPtr) userdata;
3454   if (cfp == NULL) return;
3455 
3456   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3457     if (vnp->choice == PUB_Gen) {
3458       cgp = (CitGenPtr) vnp->data.ptrvalue;
3459       if (cgp != NULL) {
3460         if (StringICmp (cgp->cit, "Unpublished") == 0) {
3461           if (StringICmp (cgp->title, "Direct Submission") != 0) {
3462             hasUnpublished = TRUE;
3463           }
3464         }
3465       }
3466     } else if (vnp->choice == PUB_Muid || vnp->choice == PUB_PMid) {
3467       return;
3468     } else if (vnp->choice == PUB_Article || vnp->choice == PUB_Book || vnp->choice == PUB_Man) {
3469       return;
3470     }
3471   }
3472 
3473   if (! hasUnpublished) return;
3474   if (cgp == NULL) return;
3475 
3476   (cfp->unpubcount)++;
3477 }
3478 
3479 static void ProcessUnpubPub (
3480   PubdescPtr pdp,
3481   Pointer userdata
3482 )
3483 
3484 {
3485   Char          buf [521];
3486   CleanFlagPtr  cfp;
3487   CitGenPtr     cgp = NULL;
3488   DatePtr       dp = NULL;
3489   Boolean       hasUnpublished = FALSE;
3490   PubRef        pr;
3491   CharPtr       tmp;
3492   ValNodePtr    vnp, vnpcgp = NULL;
3493   Int2          year = 0;
3494 
3495   if (pdp == NULL) return;
3496   cfp = (CleanFlagPtr) userdata;
3497   if (cfp == NULL) return;
3498 
3499   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3500     if (vnp->choice == PUB_Gen) {
3501       cgp = (CitGenPtr) vnp->data.ptrvalue;
3502       if (cgp != NULL) {
3503         if (StringICmp (cgp->cit, "Unpublished") == 0) {
3504           if (StringICmp (cgp->title, "Direct Submission") != 0) {
3505             hasUnpublished = TRUE;
3506             vnpcgp = vnp;
3507           }
3508         }
3509       }
3510     } else if (vnp->choice == PUB_Muid || vnp->choice == PUB_PMid) {
3511       return;
3512     } else if (vnp->choice == PUB_Article || vnp->choice == PUB_Book || vnp->choice == PUB_Man) {
3513       return;
3514     }
3515   }
3516 
3517   if (! hasUnpublished) return;
3518   if (cgp == NULL) return;
3519 
3520   MemSet ((Pointer) &pr, 0, sizeof (PubRef));
3521 
3522   pr.authors = GetAuthorMLNameList (cgp->authors);
3523   pr.titlewords = GetTitleWords (cgp->title);
3524   if (vnpcgp != NULL) {
3525     if (PubLabelUnique (vnpcgp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) {
3526       pr.uniquestr = StringSaveNoNull (buf);
3527     }
3528   }
3529 
3530   tmp = StringSave (cgp->title);
3531   TrimSpacesAndJunkFromEnds (tmp, TRUE);
3532   s_RemovePeriodFromEnd (tmp);
3533   StrStripBrackets (tmp);
3534   StrStripSpaces (tmp);
3535   pr.fulltitle = tmp;
3536 
3537   pr.journal = GetBestJournal (cgp->journal);
3538   pr.imp = NULL;
3539 
3540   dp = cgp->date;
3541   if (dp != NULL && dp->data [0] == 1) {
3542     year = (Int2) dp->data [1] + 1900;
3543   }
3544   if (year == 0) {
3545     year = cfp->year;
3546   }
3547   pr.year = year;
3548   pr.pmid = 0;
3549 
3550   if (pr.authors != NULL && pr.titlewords != NULL) {
3551     TryEntrezQueries (cfp, &pr);
3552   }
3553 
3554   ValNodeFreeData (pr.authors);
3555   ValNodeFreeData (pr.titlewords);
3556   MemFree (pr.fulltitle);
3557   MemFree (pr.uniquestr);
3558 }
3559 
3560 static void DoUnpublishedReport (
3561   SeqEntryPtr sep,
3562   CleanFlagPtr cfp
3563 )
3564 
3565 {
3566   if (sep == NULL || cfp == NULL) return;
3567 
3568   cfp->unpubcount = 0;
3569   VisitPubdescsInSep (sep, (Pointer) cfp, CountUnpubPub);
3570   VisitPubdescsInSep (sep, (Pointer) cfp, ProcessUnpubPub);
3571 }
3572 
3573 static void CountPublishedPub (
3574   PubdescPtr pdp,
3575   Pointer userdata
3576 )
3577 
3578 {
3579   CleanFlagPtr  cfp;
3580   CitArtPtr     cap = NULL;
3581   ValNodePtr    vnp;
3582 
3583   if (pdp == NULL) return;
3584   cfp = (CleanFlagPtr) userdata;
3585   if (cfp == NULL) return;
3586 
3587   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3588     if (vnp->choice == PUB_Article) {
3589       cap = (CitArtPtr) vnp->data.ptrvalue;
3590     } else if (vnp->choice == PUB_PMid) {
3591       return;
3592     }
3593   }
3594 
3595   if (cap == NULL) return;
3596 
3597   (cfp->unpubcount)++;
3598 }
3599 
3600 static void ProcessPublishedPub (
3601   PubdescPtr pdp,
3602   Pointer userdata
3603 )
3604 
3605 {
3606   CleanFlagPtr  cfp;
3607   CitArtPtr     cap = NULL;
3608   CitJourPtr    cjp;
3609   DatePtr       dp = NULL;
3610   ImprintPtr    imp;
3611   PubRef        pr;
3612   CharPtr       str;
3613   CharPtr       tmp;
3614   ValNodePtr    vnp;
3615   Int2          year = 0;
3616 
3617   if (pdp == NULL) return;
3618   cfp = (CleanFlagPtr) userdata;
3619   if (cfp == NULL) return;
3620 
3621   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
3622     if (vnp->choice == PUB_Article) {
3623       cap = (CitArtPtr) vnp->data.ptrvalue;
3624     } else if (vnp->choice == PUB_PMid) {
3625       return;
3626     }
3627   }
3628 
3629   if (cap == NULL) return;
3630 
3631   MemSet ((Pointer) &pr, 0, sizeof (PubRef));
3632 
3633   pr.authors = GetAuthorMLNameList (cap->authors);
3634   for (vnp = cap->title; vnp != NULL; vnp = vnp->next) {
3635     if (vnp->choice == Cit_title_name) {
3636       str = (CharPtr) vnp->data.ptrvalue;
3637       if (StringHasNoText (str)) continue;
3638       pr.titlewords = GetTitleWords (str);
3639       tmp = StringSave (str);
3640       TrimSpacesAndJunkFromEnds (tmp, TRUE);
3641       s_RemovePeriodFromEnd (tmp);
3642       StrStripBrackets (tmp);
3643       StrStripSpaces (tmp);
3644       pr.fulltitle = tmp;
3645     }
3646   }
3647 
3648   if (cap->from == 1) {
3649     cjp = (CitJourPtr) cap->fromptr;
3650     if (cjp != NULL) {
3651       pr.journal = GetBestJournal (cjp->title);
3652       imp = cjp->imp;
3653       pr.imp = imp;
3654       if (imp != NULL) {
3655         dp = imp->date;
3656         if (dp != NULL && dp->data [0] == 1) {
3657           year = (Int2) dp->data [1] + 1900;
3658         }
3659       }
3660     }
3661   }
3662 
3663   if (year == 0) {
3664     year = cfp->year;
3665   }
3666   pr.year = year;
3667   pr.pmid = 0;
3668 
3669   if (pr.authors != NULL && pr.titlewords != NULL) {
3670     TryEntrezQueries (cfp, &pr);
3671   }
3672 
3673   ValNodeFreeData (pr.authors);
3674   ValNodeFreeData (pr.titlewords);
3675   MemFree (pr.fulltitle);
3676 }
3677 
3678 static void DoPublishedReport (
3679   SeqEntryPtr sep,
3680   CleanFlagPtr cfp
3681 )
3682 
3683 {
3684   if (sep == NULL || cfp == NULL) return;
3685 
3686   cfp->unpubcount = 0;
3687   VisitPubdescsInSep (sep, (Pointer) cfp, CountPublishedPub);
3688   VisitPubdescsInSep (sep, (Pointer) cfp, ProcessPublishedPub);
3689 }
3690 
3691 static void RemoveFeatureCitations (
3692   SeqFeatPtr sfp,
3693   Pointer userdata
3694 )
3695 
3696 {
3697   if (sfp == NULL || sfp->cit == NULL) return;
3698 
3699   sfp->cit = PubSetFree (sfp->cit);
3700 }
3701 
#ifdef OS_UNIX
/* Reads one Seq-entry from the text ASN.1 file at path; returns NULL when
   the file cannot be opened or read. */
static SeqEntryPtr CppCleanupReadEntry (
  CharPtr path
)

{
  AsnIoPtr     aip;
  SeqEntryPtr  sep;

  aip = AsnIoOpen (path, "r");
  if (aip == NULL) return NULL;

  sep = SeqEntryAsnRead (aip, NULL);
  AsnIoClose (aip);

  return sep;
}

/*
 * CppBasicCleanup
 *
 * Writes sep (after removing feature citations) to a temporary file, runs
 * the external test_basic_cleanup program piped through "cleanasn -a e" on
 * it, and separately runs "cleanasn -K b" on the same input.  The two
 * results are read back and compared as byte stores; differences are
 * reported to cfp->logfp and, when cfp->gi is positive, a diff is appended
 * to ~/Desktop/diffclean.txt.
 *
 * Returns the Seq-entry read from the test_basic_cleanup output (NULL if
 * it could not be read).  The caller owns the returned entry.
 *
 * Changes from the earlier version:
 *   - the command buffer is sized for two PATH_MAX paths instead of a
 *     fixed 512 bytes that sprintf could overrun
 *   - the temporary files are removed with FileRemove rather than a shell
 *     "rm" command
 *   - the ASN.1 streams are checked before use
 */
static SeqEntryPtr CppBasicCleanup (
  SeqEntryPtr sep,
  CleanFlagPtr cfp
)

{
  AsnIoPtr      aop;
  ByteStorePtr  bs1, bs2;
  Char          cmmd [2 * PATH_MAX + 256];
  SeqEntryPtr   csep, nsep;
  Char          path1 [PATH_MAX];
  Char          path2 [PATH_MAX];
  Char          path3 [PATH_MAX];

  if (sep == NULL || cfp == NULL) return NULL;

  VisitFeaturesInSep (sep, NULL, RemoveFeatureCitations);

  TmpNam (path1);
  TmpNam (path2);
  TmpNam (path3);

  aop = AsnIoOpen (path1, "w");
  if (aop != NULL) {
    SeqEntryAsnWrite (sep, aop, NULL);
    AsnIoClose (aop);
  }

  /* path2 receives the output of the external cleanup program */
  sprintf (cmmd, "%s -i %s | cleanasn -a e -o %s",
           "~/ncbi_cxx/compilers/xCode/build/bin/Debug/test_basic_cleanup",
           path1, path2);
  system (cmmd);

  /* path3 receives the output of cleanasn's own basic cleanup */
  sprintf (cmmd, "cleanasn -i %s -o %s -K b",
           path1, path3);
  system (cmmd);

  csep = CppCleanupReadEntry (path3);
  bs1 = Se2Bs (csep);

  nsep = CppCleanupReadEntry (path2);
  bs2 = Se2Bs (nsep);

  if (nsep == NULL) {
    if (cfp->logfp != NULL) {
      fprintf (cfp->logfp, "EMPTY %s\n", cfp->buf);
      fflush (cfp->logfp);
    }
  } else if (! BSEqual (bs1, bs2)) {
    if (cfp->logfp != NULL) {
      fprintf (cfp->logfp, "BSEC DIFF %s\n", cfp->buf);
      fflush (cfp->logfp);
    }
    if (cfp->gi > 0) {
      system ("echo '' >> ~/Desktop/diffclean.txt");
      system ("echo '' >> ~/Desktop/diffclean.txt");
      sprintf (cmmd, "echo '********** gi|%ld **********' >> ~/Desktop/diffclean.txt", (long) cfp->gi);
      system (cmmd);
      system ("echo '' >> ~/Desktop/diffclean.txt");
      sprintf (cmmd, "diff %s %s >> ~/Desktop/diffclean.txt", path3, path2);
      system (cmmd);
    }
  }

  BSFree (bs1);
  BSFree (bs2);

  SeqEntryFree (csep);

  FileRemove (path1);
  FileRemove (path2);
  FileRemove (path3);

  return nsep;
}
#endif
3785 
3786 /* now only strips serials for local, general, refseq, and 2+6 genbank ids */
3787 static void CheckForSwissProtIDX (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
3788 
3789 {
3790   BioseqPtr     bsp;
3791   SeqIdPtr      sip;
3792   BoolPtr       stripSerial;
3793   TextSeqIdPtr  tsip;
3794 
3795   if (sep == NULL) return;
3796   if (IS_Bioseq (sep)) {
3797     bsp = (BioseqPtr) sep->data.ptrvalue;
3798     if (bsp == NULL) return;
3799     stripSerial = (BoolPtr) mydata;
3800     if (stripSerial == NULL) return;
3801     for (sip = bsp->id; sip != NULL; sip = sip->next) {
3802       switch (sip->choice) {
3803         case SEQID_GIBBSQ :
3804         case SEQID_GIBBMT :
3805           *stripSerial = FALSE;
3806           break;
3807         case SEQID_EMBL :
3808         case SEQID_PIR :
3809         case SEQID_SWISSPROT :
3810         case SEQID_PATENT :
3811         case SEQID_DDBJ :
3812         case SEQID_PRF :
3813         case SEQID_PDB :
3814         case SEQID_TPE:
3815         case SEQID_TPD:
3816         case SEQID_GPIPE:
3817           *stripSerial = FALSE;
3818           break;
3819         case SEQID_GENBANK :
3820         case SEQID_TPG:
3821           tsip = (TextSeqIdPtr) sip->data.ptrvalue;
3822           if (tsip != NULL) {
3823             if (StringLen (tsip->accession) == 6) {
3824               *stripSerial = FALSE;
3825             }
3826           }
3827           break;
3828         case SEQID_NOT_SET :
3829         case SEQID_LOCAL :
3830         case SEQID_OTHER :
3831         case SEQID_GENERAL :
3832           break;
3833         default :
3834           break;
3835       }
3836     }
3837   }
3838 }
3839 
3840 static time_t DoCleanup (
3841   SeqEntryPtr sep,
3842   Uint2 entityID,
3843   CleanFlagPtr cfp,
3844   AsnIoPtr aop,
3845   AsnTypePtr atp,
3846   SeqSubmitPtr ssp
3847 )
3848 
3849 {
3850   BioseqPtr    bsp;
3851   DatePtr      dp;
3852   SeqEntryPtr  fsep, nsep = NULL;
3853   Boolean      isDdbj = FALSE, isEmbl = FALSE, isGenBank = FALSE, isNcbi = FALSE, isRefSeq = FALSE;
3854   Int4         nucs, prts;
3855   SumDataPtr   sdp;
3856   SeqIdPtr     sip, siphead;
3857   time_t       starttime, stoptime;
3858   Boolean      stripSerial = TRUE;
3859   SeqDescrPtr  vnp;
3860 
3861   if (sep == NULL || cfp == NULL) return 0;
3862 
3863   AssignIDsInEntityEx (entityID, 0, NULL, NULL);
3864 
3865   starttime = GetSecs ();
3866 
3867   StringCpy (cfp->buf, "");
3868   cfp->gi = 0;
3869   cfp->year = 0;
3870   cfp->isRefSeq = FALSE;
3871 
3872   fsep = FindNthBioseq (sep, 1);
3873   if (fsep != NULL && fsep->choice == 1) {
3874     bsp = (BioseqPtr) fsep->data.ptrvalue;
3875     if (bsp != NULL) {
3876       siphead = SeqIdSetDup (bsp->id);
3877       for (sip = siphead; sip != NULL; sip = sip->next) {
3878         SeqIdStripLocus (sip);
3879         if (sip->choice == SEQID_GI) {
3880           cfp->gi = (Int4) sip->data.intvalue;
3881         } else if (sip->choice == SEQID_GENBANK || sip->choice == SEQID_TPG) {
3882           isGenBank = TRUE;
3883           isNcbi = TRUE;
3884         } else if (sip->choice == SEQID_EMBL || sip->choice == SEQID_TPE) {
3885           isEmbl = TRUE;
3886         } else if (sip->choice == SEQID_DDBJ || sip->choice == SEQID_TPD) {
3887           isDdbj = TRUE;
3888         } else if (sip->choice == SEQID_OTHER) {
3889           isRefSeq = TRUE;
3890           isNcbi = TRUE;
3891           cfp->isRefSeq = TRUE;
3892         }
3893       }
3894       SeqIdWrite (siphead, cfp->buf, PRINTID_FASTA_LONG, sizeof (cfp->buf));
3895       SeqIdSetFree (siphead);
3896     }
3897     vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_update_date, NULL);
3898     if (vnp == NULL) {
3899       vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_create_date, NULL);
3900     }
3901     if (vnp != NULL) {
3902       dp = (DatePtr) vnp->data.ptrvalue;
3903       if (dp != NULL && dp->data [0] == 1) {
3904         cfp->year = (Int2) dp->data [1] + 1900;
3905       }
3906     }
3907   }
3908 
3909   SeqEntryExplore (sep, (Pointer) &stripSerial, CheckForSwissProtIDX);
3910   cfp->stripSerial = stripSerial;
3911 
3912   if (StringDoesHaveText (cfp->sourcedb)) {
3913     if (StringChr (cfp->sourcedb, 'g') != NULL) {
3914       if (! isGenBank) return 0;
3915     }
3916     if (StringChr (cfp->sourcedb, 'e') != NULL) {
3917       if (! isEmbl) return 0;
3918     }
3919     if (StringChr (cfp->sourcedb, 'd') != NULL) {
3920       if (! isDdbj) return 0;
3921     }
3922     if (StringChr (cfp->sourcedb, 'r') != NULL) {
3923       if (! isRefSeq) return 0;
3924     }
3925     if (StringChr (cfp->sourcedb, 'n') != NULL) {
3926       if (! isNcbi) return 0;
3927     }
3928     if (StringChr (cfp->sourcedb, 'x') != NULL) {
3929       if (isEmbl || isDdbj) return 0;
3930     }
3931   }
3932 
3933   nucs = VisitSequencesInSep (sep, NULL, VISIT_NUCS, NULL);
3934   prts = VisitSequencesInSep (sep, NULL, VISIT_PROTS, NULL);
3935   cfp->rawcounts.nucs += nucs;
3936   cfp->rawcounts.prts += prts;
3937   (cfp->rawcounts.recs)++;
3938   cfp->cumcounts.nucs += nucs;
3939   cfp->cumcounts.prts += prts;
3940   (cfp->cumcounts.recs)++;
3941 
3942   sdp = NULL;
3943   if (isGenBank) {
3944     sdp = &(cfp->dbsums.genbank);
3945   } else if (isEmbl) {
3946     sdp = &(cfp->dbsums.embl);
3947   } else if (isDdbj) {
3948     sdp = &(cfp->dbsums.ddbj);
3949   } else if (isRefSeq) {
3950     sdp = &(cfp->dbsums.refseq);
3951   } else {
3952     sdp = &(cfp->dbsums.other);
3953   }
3954   if (sdp != NULL) {
3955     sdp->nucs += nucs;
3956     sdp->prts += prts;
3957     (sdp->recs)++;
3958   }
3959 
3960   if (StringChr (cfp->report, 'c') != NULL) {
3961     return 0;
3962   }
3963   if (StringChr (cfp->report, 'r') != NULL) {
3964     DoASNReport (sep, cfp, FALSE, FALSE);
3965     stoptime = GetSecs ();
3966     return stoptime - starttime;
3967   }
3968   if (StringChr (cfp->report, 's') != NULL) {
3969     DoASNReport (sep, cfp, TRUE, FALSE);
3970     stoptime = GetSecs ();
3971     return stoptime - starttime;
3972   }
3973   if (StringChr (cfp->report, 'n') != NULL) {
3974     DoASNReport (sep, cfp, TRUE, TRUE);
3975     stoptime = GetSecs ();
3976     return stoptime - starttime;
3977   }
3978   if (StringChr (cfp->report, 'd') != NULL) {
3979     DoAsnDiffReport (sep, cfp);
3980     stoptime = GetSecs ();
3981     return stoptime - starttime;
3982   }
3983   if (StringChr (cfp->report, 'g') != NULL) {
3984     DoGBFFReport (sep, cfp, 1);
3985     stoptime = GetSecs ();
3986     return stoptime - starttime;
3987   }
3988   if (StringChr (cfp->report, 'f') != NULL) {
3989     DoGBFFReport (sep, cfp, 2);
3990     stoptime = GetSecs ();
3991     return stoptime - starttime;
3992   }
3993   if (StringChr (cfp->report, 'v') != NULL) {
3994     DoValidatorReport (sep, cfp->logfp, cfp->buf, cfp->asnval);
3995     stoptime = GetSecs ();
3996     return stoptime - starttime;
3997   }
3998   if (StringChr (cfp->report, 'm') != NULL) {
3999     DoModernizeReport (sep, cfp);
4000     stoptime = GetSecs ();
4001     return stoptime - starttime;
4002   }
4003   if (StringChr (cfp->report, 'u') != NULL) {
4004     DoUnpublishedReport (sep, cfp);
4005     stoptime = GetSecs ();
4006     return stoptime - starttime;
4007   }
4008   if (StringChr (cfp->report, 'p') != NULL) {
4009     DoPublishedReport (sep, cfp);
4010     stoptime = GetSecs ();
4011     return stoptime - starttime;
4012   }
4013 
4014   if (StringDoesHaveText (cfp->report)) return 0;
4015 
4016   if (cfp->logfp != NULL) {
4017     fprintf (cfp->logfp, "%s\n", cfp->buf);
4018     fflush (cfp->logfp);
4019   }
4020 
4021   if (StringChr (cfp->clean, 'b') != NULL) {
4022     BasicSeqEntryCleanup (sep);
4023   }
4024 #ifdef OS_UNIX
4025   if (StringChr (cfp->clean, 'p') != NULL) {
4026     nsep = CppBasicCleanup (sep, cfp);
4027   }
4028 #endif
4029   if (StringChr (cfp->clean, 's') != NULL) {
4030     SeriousSeqEntryCleanup (sep, NULL, NULL);
4031   }
4032   if (StringChr (cfp->clean, 'g') != NULL) {
4033     GpipeSeqEntryCleanup (sep);
4034   }
4035   if (StringChr (cfp->clean, 'n') != NULL) {
4036     NormalizeDescriptorOrder (sep);
4037   }
4038   if (StringChr (cfp->clean, 'u') != NULL) {
4039     RemoveAllNcbiCleanupUserObjects (sep);
4040   }
4041 
4042   if (StringChr (cfp->modernize, 'g') != NULL) {
4043     VisitFeaturesInSep (sep, NULL, ModGenes);
4044   }
4045   if (StringChr (cfp->modernize, 'r') != NULL) {
4046     VisitFeaturesInSep (sep, NULL, ModRNAs);
4047   }
4048   if (StringChr (cfp->modernize, 'p') != NULL) {
4049     VisitBioSourcesInSep (sep, NULL, ModPCRs);
4050   }
4051 
4052   if (cfp->taxon) {
4053     Taxon3ReplaceOrgInSeqEntry (sep, FALSE);
4054   }
4055 
4056   if (cfp->pub) {
4057     VisitPubdescsInSep (sep, NULL, LookupPubdesc);
4058   }
4059 
4060   if (StringChr (cfp->link, 'o') != NULL) {
4061     SeqMgrIndexFeatures (entityID, 0);
4062     LinkCDSmRNAbyOverlap (sep);
4063   }
4064   if (StringChr (cfp->link, 'p') != NULL) {
4065     SeqMgrIndexFeatures (entityID, 0);
4066     LinkCDSmRNAbyProduct (sep);
4067   }
4068   if (StringChr (cfp->link, 'r') != NULL) {
4069     SeqMgrIndexFeatures (entityID, 0);
4070     ReassignFeatureIDs (sep);
4071   }
4072   if (StringChr (cfp->link, 'c') != NULL) {
4073     ClearFeatureIDs (sep);
4074   }
4075 
4076   if (StringChr (cfp->feat, 'u') != NULL) {
4077     VisitFeaturesInSep (sep, NULL, RemoveFeatUser);
4078   }
4079   if (StringChr (cfp->feat, 'd') != NULL) {
4080     VisitFeaturesInSep (sep, NULL, RemoveFeatDbxref);
4081   }
4082   if (StringChr (cfp->feat, 'r') != NULL) {
4083     SeqMgrIndexFeatures (entityID, 0);
4084     VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref);
4085   }
4086 
4087   if (StringChr (cfp->desc, 't') != NULL) {
4088     VisitDescriptorsInSep (sep, NULL, MarkTitles);
4089     DeleteMarkedObjects (entityID, 0, NULL);
4090   }
4091 
4092   if (StringChr (cfp->mods, 'd') != NULL) {
4093     SeqMgrIndexFeatures (entityID, 0);
4094     DoAutoDef (sep, entityID);
4095   }
4096 
4097   if (cfp->action_list != NULL) {
4098     ApplyMacroToSeqEntry (sep, cfp->action_list, NULL, NULL);
4099   }
4100 
4101   stoptime = GetSecs ();
4102 
4103   if (aop != NULL) {
4104     if (ssp != NULL) {
4105       SeqSubmitAsnWrite (ssp, aop, atp);
4106     } else if (nsep != NULL) {
4107       SeqEntryAsnWrite (nsep, aop, atp);
4108       SeqEntryFree (nsep);
4109     } else {
4110       SeqEntryAsnWrite (sep, aop, atp);
4111     }
4112   }
4113 
4114   return stoptime - starttime;
4115 }
4116 
4117 static void CleanupSingleRecord (
4118   CharPtr filename,
4119   CleanFlagPtr cfp
4120 )
4121 
4122 {
4123   AsnIoPtr      aip, aop = NULL;
4124   BioseqPtr     bsp;
4125   BioseqSetPtr  bssp;
4126   Pointer       dataptr = NULL;
4127   Uint2         datatype, entityID = 0;
4128   FILE          *fp;
4129   Char          path [PATH_MAX];
4130   CharPtr       ptr;
4131   SeqEntryPtr   sep;
4132   SeqSubmitPtr  ssp = NULL;
4133 
4134   if (cfp == NULL) return;
4135 
4136   if (StringHasNoText (filename)) return;
4137 
4138   if (cfp->type == 1) {
4139     fp = FileOpen (filename, "r");
4140     if (fp == NULL) {
4141       Message (MSG_POSTERR, "Failed to open '%s'", filename);
4142       return;
4143     }
4144 
4145     dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE);
4146 
4147     FileClose (fp);
4148 
4149     entityID = ObjMgrRegister (datatype, dataptr);
4150 
4151   } else if (cfp->type >= 2 && cfp->type <= 5) {
4152     aip = AsnIoOpen (filename, cfp->binary? "rb" : "r");
4153     if (aip == NULL) {
4154       Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", filename);
4155       return;
4156     }
4157 
4158     SeqMgrHoldIndexing (TRUE);
4159     switch (cfp->type) {
4160       case 2 :
4161         dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
4162         datatype = OBJ_SEQENTRY;
4163         break;
4164       case 3 :
4165         dataptr = (Pointer) BioseqAsnRead (aip, NULL);
4166         datatype = OBJ_BIOSEQ;
4167         break;
4168       case 4 :
4169         dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
4170         datatype = OBJ_BIOSEQSET;
4171         break;
4172       case 5 :
4173         dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
4174         ssp = (SeqSubmitPtr) dataptr;
4175         datatype = OBJ_SEQSUB;
4176         break;
4177       default :
4178         break;
4179     }
4180     SeqMgrHoldIndexing (FALSE);
4181 
4182     AsnIoClose (aip);
4183 
4184     entityID = ObjMgrRegister (datatype, dataptr);
4185 
4186   } else {
4187     Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) cfp->type);
4188     return;
4189   }
4190 
4191   if (entityID < 1 || dataptr == NULL) {
4192     Message (MSG_POSTERR, "Data read failed for input file '%s'", filename);
4193     return;
4194   }
4195 
4196   if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
4197         datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {
4198 
4199     sep = GetTopSeqEntryForEntityID (entityID);
4200 
4201     if (sep == NULL) {
4202       sep = SeqEntryNew ();
4203       if (sep != NULL) {
4204         if (datatype == OBJ_BIOSEQ) {
4205           bsp = (BioseqPtr) dataptr;
4206           sep->choice = 1;
4207           sep->data.ptrvalue = bsp;
4208           SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
4209         } else if (datatype == OBJ_BIOSEQSET) {
4210           bssp = (BioseqSetPtr) dataptr;
4211           sep->choice = 2;
4212           sep->data.ptrvalue = bssp;
4213           SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
4214         } else {
4215           sep = SeqEntryFree (sep);
4216         }
4217       }
4218       sep = GetTopSeqEntryForEntityID (entityID);
4219     }
4220 
4221     if (sep != NULL) {
4222 
4223       path [0] = '\0';
4224       if (StringDoesHaveText (cfp->outfile)) {
4225 
4226         StringNCpy_0 (path, cfp->outfile, sizeof (path));
4227       
4228       } else if (StringDoesHaveText (cfp->results)) {
4229 
4230         ptr = StringRChr (filename, DIRDELIMCHR);
4231         if (ptr != NULL) {
4232           StringNCpy_0 (path, cfp->results, sizeof (path));
4233           ptr++;
4234           FileBuildPath (path, NULL, ptr);
4235         }
4236       }
4237 
4238       sep = GetTopSeqEntryForEntityID (entityID);
4239       if (sep != NULL) {
4240 
4241         if (StringHasNoText (cfp->report) && StringDoesHaveText (path)) {
4242           aop = AsnIoOpen (path, "w");
4243         }
4244 
4245         DoCleanup (sep, entityID, cfp, aop, NULL, ssp);
4246 
4247         if (aop != NULL) {
4248           AsnIoFlush (aop);
4249           AsnIoClose (aop);
4250         }
4251       }
4252 
4253       ObjMgrFreeByEntityID (entityID);
4254     }
4255 
4256   } else {
4257 
4258     Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
4259   }
4260 }
4261 
4262 static void CleanupMultipleRecord (
4263   CharPtr filename,
4264   CleanFlagPtr cfp
4265 )
4266 
4267 {
4268   AsnIoPtr     aip, aop = NULL;
4269   AsnTypePtr   atp;
4270   DataVal      av;
4271   Char         ch;
4272   Uint2        entityID;
4273   FILE         *fp;
4274   size_t       len;
4275   Char         longest [64];
4276   Int4         numrecords;
4277   Char         path [PATH_MAX];
4278   CharPtr      ptr;
4279   SeqEntryPtr  sep;
4280   time_t       timediff, worsttime;
4281 #ifdef OS_UNIX
4282   Char         cmmd [512];
4283   CharPtr      gzcatprog;
4284   int          ret;
4285   Boolean      usedPopen = FALSE;
4286 #endif
4287 
4288   if (cfp == NULL) return;
4289 
4290   if (StringHasNoText (filename)) return;
4291 
4292   path [0] = '\0';
4293   if (StringDoesHaveText (cfp->outfile)) {
4294 
4295     StringNCpy_0 (path, cfp->outfile, sizeof (path));
4296       
4297   } else if (StringDoesHaveText (cfp->results)) {
4298 
4299     ptr = StringRChr (filename, DIRDELIMCHR);
4300     if (ptr != NULL) {
4301       StringNCpy_0 (path, cfp->results, sizeof (path));
4302       ptr++;
4303       if (cfp->compressed) {
4304         len = StringLen (ptr);
4305         if (len > 4 && StringCmp (ptr + len - 3, ".gz") == 0) {
4306           ptr [len - 3] = '\0';
4307         }
4308       }
4309       FileBuildPath (path, NULL, ptr);
4310     }
4311   }
4312   if (StringHasNoText (cfp->report) && StringHasNoText (path)) return;
4313 
4314 #ifndef OS_UNIX
4315   if (cfp->compressed) {
4316     Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
4317     return;
4318   }
4319 #endif
4320 
4321 #ifdef OS_UNIX
4322   if (cfp->compressed) {
4323     gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
4324     if (gzcatprog != NULL) {
4325       sprintf (cmmd, "%s %s", gzcatprog, filename);
4326     } else {
4327       ret = system ("gzcat -h >/dev/null 2>&1");
4328       if (ret == 0) {
4329         sprintf (cmmd, "gzcat %s", filename);
4330       } else if (ret == -1) {
4331         Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
4332         return;
4333       } else {
4334         ret = system ("zcat -h >/dev/null 2>&1");
4335         if (ret == 0) {
4336           sprintf (cmmd, "zcat %s", filename);
4337         } else if (ret == -1) {
4338           Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
4339           return;
4340         } else {
4341           Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
4342           return;
4343         }
4344       }
4345     }
4346     fp = popen (cmmd, /* cfp->binary? "rb" : */ "r");
4347     usedPopen = TRUE;
4348   } else {
4349     fp = FileOpen (filename, cfp->binary? "rb" : "r");
4350   }
4351 #else
4352   fp = FileOpen (filename, cfp->binary? "rb" : "r");
4353 #endif
4354   if (fp == NULL) {
4355     Message (MSG_POSTERR, "FileOpen failed for input file '%s'", filename);
4356     return;
4357   }
4358 
4359   aip = AsnIoNew (cfp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
4360   if (aip == NULL) {
4361     Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", filename);
4362     return;
4363   }
4364 
4365   if (cfp->logfp != NULL) {
4366     if (StringChr (cfp->report, 'c') == NULL) {
4367       fprintf (cfp->logfp, "%s\n\n", filename);
4368       fflush (cfp->logfp);
4369     }
4370   }
4371 
4372   longest [0] = '\0';
4373   worsttime = 0;
4374   numrecords = 0;
4375 
4376   if (StringHasNoText (cfp->report)) {
4377     aop = AsnIoOpen (path, cfp->binary? "wb" : "w");
4378     if (aop != NULL) {
4379       AsnOpenStruct (aop, cfp->bssp_atp, (Pointer) &(cfp->bss));
4380       av.intvalue = 7;
4381       AsnWrite (aop, cfp->atp_bsc, &av);
4382       AsnOpenStruct (aop, cfp->atp_bsss, (Pointer) &(cfp->bss.seq_set));
4383     }
4384   }
4385 
4386   atp = cfp->atp_bss;
4387 
4388   while ((atp = AsnReadId (aip, cfp->amp, atp)) != NULL) {
4389     if (atp == cfp->atp_se) {
4390 
4391       SeqMgrHoldIndexing (TRUE);
4392       sep = SeqEntryAsnRead (aip, atp);
4393       SeqMgrHoldIndexing (FALSE);
4394 
4395       if (sep != NULL) {
4396 
4397         entityID = ObjMgrGetEntityIDForChoice (sep);
4398 
4399         timediff = DoCleanup (sep, entityID, cfp, aop, cfp->atp_se, NULL);
4400 
4401         if (timediff > worsttime) {
4402           worsttime = timediff;
4403           StringCpy (longest, cfp->buf);
4404           ptr = longest;
4405           ch = *ptr;
4406           while (ch != '\0') {
4407             if (ch == '|') {
4408               *ptr = ' ';
4409             }
4410             ptr++;
4411             ch = *ptr;
4412           }
4413         }
4414         numrecords++;
4415 
4416         ObjMgrFreeByEntityID (entityID);
4417       }
4418 
4419     } else {
4420 
4421       AsnReadVal (aip, atp, NULL);
4422     }
4423   }
4424 
4425   if (aop != NULL) {
4426     AsnCloseStruct (aop, cfp->atp_bsss, (Pointer) &(cfp->bss.seq_set));
4427     AsnCloseStruct (aop, cfp->bssp_atp, (Pointer) &(cfp->bss));
4428     AsnIoClose (aop);
4429   }
4430 
4431   AsnIoFree (aip, FALSE);
4432 
4433 #ifdef OS_UNIX
4434   if (usedPopen) {
4435     pclose (fp);
4436   } else {
4437     FileClose (fp);
4438   }
4439 #else
4440   FileClose (fp);
4441 #endif
4442   if (cfp->logfp != NULL) {
4443     if (StringChr (cfp->report, 'c') == NULL) {
4444       fprintf (cfp->logfp, "\nTotal number of records %ld\n", (long) numrecords);
4445       if (StringDoesHaveText (longest)) {
4446         fprintf (cfp->logfp, "Longest processing time %ld seconds on %s\n",
4447                  (long) worsttime, longest);
4448       }
4449       fprintf (cfp->logfp, "Counts ");
4450       fprintf (cfp->logfp, "- %9ld RECS", (long) cfp->rawcounts.recs);
4451       fprintf (cfp->logfp, ", %9ld NUCS", (long) cfp->rawcounts.nucs);
4452       fprintf (cfp->logfp, ", %9ld PRTS", (long) cfp->rawcounts.prts);
4453       fprintf (cfp->logfp, ", %9ld OKAY", (long) cfp->rawcounts.okay);
4454       fprintf (cfp->logfp, ", %9ld NORM", (long) cfp->rawcounts.norm);
4455       fprintf (cfp->logfp, ", %9ld CLNR", (long) cfp->rawcounts.clnr);
4456       fprintf (cfp->logfp, ", %9ld OTHR", (long) cfp->rawcounts.othr);
4457       fprintf (cfp->logfp, ", %9ld MODR", (long) cfp->rawcounts.modr);
4458       fprintf (cfp->logfp, ", %9ld SLOC", (long) cfp->rawcounts.sloc);
4459       fprintf (cfp->logfp, ", %9ld PUBL", (long) cfp->rawcounts.publ);
4460       fprintf (cfp->logfp, ", %9ld AUTH", (long) cfp->rawcounts.auth);
4461       fprintf (cfp->logfp, ", %9ld SORT", (long) cfp->rawcounts.sort);
4462       fprintf (cfp->logfp, ", %9ld BSEC", (long) cfp->rawcounts.bsec);
4463       fprintf (cfp->logfp, ", %9ld GBBK", (long) cfp->rawcounts.gbbk);
4464       fprintf (cfp->logfp, ", %9ld TITL", (long) cfp->rawcounts.titl);
4465       fprintf (cfp->logfp, ", %9ld PACK", (long) cfp->rawcounts.pack);
4466       fprintf (cfp->logfp, ", %9ld MOVE", (long) cfp->rawcounts.move);
4467       fprintf (cfp->logfp, ", %9ld SSEC", (long) cfp->rawcounts.ssec);
4468       fprintf (cfp->logfp, "\n");
4469       fflush (cfp->logfp);
4470     }
4471   }
4472 }
4473 
4474 static void CleanupOneRecord (
4475   CharPtr filename,
4476   Pointer userdata
4477 )
4478 
4479 {
4480   CleanFlagPtr  cfp;
4481   CharPtr       ptr;
4482   SumDataPtr    sdp;
4483 
4484   if (StringHasNoText (filename)) return;
4485   cfp = (CleanFlagPtr) userdata;
4486   if (cfp == NULL) return;
4487 
4488   MemSet ((Pointer) &(cfp->rawcounts), 0, sizeof (CountData));
4489   MemSet ((Pointer) &(cfp->dbsums), 0, sizeof (DbSumData));
4490 
4491   if (StringChr (cfp->sourcedb, 'y') != NULL) {
4492     ptr = StringRChr (filename, DIRDELIMCHR);
4493     if (ptr != NULL) {
4494       ptr++;
4495       if (StringStr (ptr, "gbcon") != NULL ||
4496           StringStr (ptr, "gbest") != NULL ||
4497           StringStr (ptr, "gbgss") != NULL ||
4498           StringStr (ptr, "gbhtg") != NULL ||
4499           StringStr (ptr, "gbpat") != NULL ||
4500           StringStr (ptr, "gbsts") != NULL) return;
4501     }
4502   }
4503 
4504   if (cfp->batch) {
4505     ptr = StringRChr (filename, DIRDELIMCHR);
4506     if (ptr != NULL) {
4507       ptr++;
4508       if (StringDoesHaveText (cfp->firstfile)) {
4509         if (StringICmp (cfp->firstfile, ptr) == 0) {
4510           cfp->foundfirst = TRUE;
4511         }
4512         if (! cfp->foundfirst) return;
4513       }
4514 
4515       if (StringDoesHaveText (cfp->lastfile)) {
4516         if (cfp->foundlast) return;
4517         if (StringICmp (cfp->lastfile, ptr) == 0) {
4518           cfp->foundlast = TRUE;
4519         }
4520       }
4521     }
4522 
4523     CleanupMultipleRecord (filename, cfp);
4524   } else {
4525     CleanupSingleRecord (filename, cfp);
4526   }
4527 
4528   if (cfp->logfp != NULL) {
4529     if (StringChr (cfp->report, 'c') != NULL) {
4530       ptr = StringRChr (filename, DIRDELIMCHR);
4531       if (ptr != NULL) {
4532         ptr++;
4533         fprintf (cfp->logfp, "%s", ptr);
4534       }
4535       sdp = &(cfp->dbsums.genbank);
4536       if (sdp != NULL) {
4537         fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4538       }
4539       sdp = &(cfp->dbsums.embl);
4540       if (sdp != NULL) {
4541         fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4542       }
4543       sdp = &(cfp->dbsums.ddbj);
4544       if (sdp != NULL) {
4545         fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4546       }
4547       sdp = &(cfp->dbsums.refseq);
4548       if (sdp != NULL) {
4549         fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4550       }
4551       sdp = &(cfp->dbsums.other);
4552       if (sdp != NULL) {
4553         fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) sdp->recs, (long) sdp->nucs, (long) sdp->prts);
4554       }
4555       fprintf (cfp->logfp, "\t%ld\t%ld\t%ld", (long) cfp->rawcounts.recs, (long) cfp->rawcounts.nucs, (long) cfp->rawcounts.prts);
4556       fprintf (cfp->logfp, "\n");
4557       fflush (cfp->logfp);
4558     }
4559   }
4560 }
4561 
4562 /* Args structure contains command-line arguments */
4563 
/*
 * Index of each command-line option within the myargs table.  The prefix
 * letter of every enumerator is the option's flag character; the ordinal
 * must equal the position of the matching myargs entry.
 */
typedef enum {
  p_argInputPath   =  0,  /* -p */
  r_argOutputPath  =  1,  /* -r */
  i_argInputFile   =  2,  /* -i */
  o_argOutputFile  =  3,  /* -o */
  f_argFilter      =  4,  /* -f */
  x_argSuffix      =  5,  /* -x */
  j_argFirstFile   =  6,  /* -j */
  k_argLastFile    =  7,  /* -k */
  d_argSourceDb    =  8,  /* -d */
  a_argType        =  9,  /* -a */
  b_argBinary      = 10,  /* -b */
  c_argCompressed  = 11,  /* -c */
  L_argLogFile     = 12,  /* -L */
  R_argRemote      = 13,  /* -R */
  Q_argReport      = 14,  /* -Q */
  S_argSelective   = 15,  /* -S */
  m_argFfMode      = 16,  /* -m */
  q_argFfDiff      = 17,  /* -q */
  n_argAsn2Flat    = 18,  /* -n */
  v_argAsnVal      = 19,  /* -v */
  K_argClean       = 20,  /* -K */
  U_argModernize   = 21,  /* -U */
  N_argLink        = 22,  /* -N */
  F_argFeat        = 23,  /* -F */
  D_argDesc        = 24,  /* -D */
  X_argMods        = 25,  /* -X */
  M_argMacro       = 26,  /* -M */
  T_argTaxonLookup = 27,  /* -T */
  P_argPubLookup   = 28   /* -P */
} Arguments;
4595 
4596 Args myargs [] = {
4597   {"Path to Files", NULL, NULL, NULL,
4598     TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
4599   {"Path for Results", NULL, NULL, NULL,
4600     TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
4601   {"Single Input File", "stdin", NULL, NULL,
4602     TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
4603   {"Single Output File", "stdout", NULL, NULL,
4604     TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
4605   {"Substring Filter", NULL, NULL, NULL,
4606     TRUE, 'f', ARG_STRING, 0.0, 0, NULL},
4607   {"File Selection Suffix", ".ent", NULL, NULL,
4608     TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
4609   {"First File Name", NULL, NULL, NULL,
4610     TRUE, 'j', ARG_STRING, 0.0, 0, NULL},
4611   {"Last File Name", NULL, NULL, NULL,
4612     TRUE, 'k', ARG_STRING, 0.0, 0, NULL},
4613   {"Source Database\n"
4614    "      a Any\n"
4615    "      g GenBank\n"
4616    "      e EMBL\n"
4617    "      d DDBJ\n"
4618    "      r RefSeq\n"
4619    "      n NCBI\n"
4620    "      x Exclude EMBL/DDBJ\n"
4621    "      y Exclude gbcon, gbest, gbgss, gbhtg, gbpat, gbsts\n", "a", NULL, NULL,
4622     TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
4623   {"ASN.1 Type\n"
4624    "      a Any\n"
4625    "      e Seq-entry\n"
4626    "      b Bioseq\n"
4627    "      s Bioseq-set\n"
4628    "      m Seq-submit\n"
4629    "      t Batch Processing\n", "a", NULL, NULL,
4630     TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
4631   {"Bioseq-set is Binary", "F", NULL, NULL,
4632     TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
4633   {"Bioseq-set is Compressed", "F", NULL, NULL,
4634     TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
4635   {"Log File", NULL, NULL, NULL,
4636     TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL},
4637   {"Remote Fetching from ID", "F", NULL, NULL,
4638     TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
4639   {"Report\n"
4640    "      c Record Count\n"
4641    "      r ASN.1 BSEC Report\n"
4642    "      s ASN.1 SSEC Report\n"
4643    "      n NORM vs. SSEC Report\n"
4644    "      d Log SSEC Differences\n"
4645    "      g GenBank SSEC Diff\n"
4646    "      f asn2gb/asn2flat Diff\n"
4647    "      v Validator SSEC Diff\n"
4648    "      m Modernize Gene/RNA/PCR\n"
4649    "      u Unpublished Pub Lookup\n"
4650    "      p Published Pub Lookup\n", NULL, NULL, NULL,
4651     TRUE, 'Q', ARG_STRING, 0.0, 0, NULL},
4652   {"Selective Difference Filter\n"
4653    "      a Author\n"
4654    "      p Publication\n"
4655    "      l Location\n"
4656    "      r RNA\n"
4657    "      s Qualifier Sort Order\n"
4658    "      g Genbank Block\n"
4659    "      k Package CdRegion or Parts Features\n"
4660    "      m Move Publication\n"
4661    "      (Capital Letters Skip)\n", NULL, NULL, NULL,
4662     TRUE, 'S', ARG_STRING, 0.0, 0, NULL},
4663   {"Flatfile Mode\n"
4664    "      r Release\n"
4665    "      e Entrez\n"
4666    "      s Sequin\n"
4667    "      d Dump\n", NULL, NULL, NULL,
4668     TRUE, 'm', ARG_STRING, 0.0, 0, NULL},
4669   {"ffdiff Executable", "/netopt/genbank/subtool/bin/ffdiff", NULL, NULL,
4670     TRUE, 'q', ARG_FILE_IN, 0.0, 0, NULL},
4671   {"asn2flat Executable", "/netopt/ncbi_tools/bin/asn2flat", NULL, NULL,
4672     TRUE, 'n', ARG_FILE_IN, 0.0, 0, NULL},
4673   {"asnval Executable", "/netopt/ncbi_tools/bin/asnval", NULL, NULL,
4674     TRUE, 'v', ARG_FILE_IN, 0.0, 0, NULL},
4675   {"Cleanup\n"
4676    "      b BasicSeqEntryCleanup\n"
4677    "      p C++ BasicCleanup\n"
4678    "      s SeriousSeqEntryCleanup\n"
4679    "      g GpipeSeqEntryCleanup\n"
4680    "      n Normalize Descriptor Order\n"
4681    "      u Remove NcbiCleanup User Objects\n", NULL, NULL, NULL,
4682     TRUE, 'K', ARG_STRING, 0.0, 0, NULL},
4683   {"Modernize\n"
4684    "      g Gene\n"
4685    "      r RNA\n"
4686    "      p PCR Primers\n", NULL, NULL, NULL,
4687     TRUE, 'U', ARG_STRING, 0.0, 0, NULL},
4688   {"Link\n"
4689    "      o LinkCDSmRNAbyOverlap\n"
4690    "      p LinkCDSmRNAbyProduct\n"
4691    "      r ReassignFeatureIDs\n"
4692    "      c ClearFeatureIDs\n", NULL, NULL, NULL,
4693     TRUE, 'N', ARG_STRING, 0.0, 0, NULL},
4694   {"Feature\n"
4695    "      u Remove User Object\n"
4696    "      d Remove db_xref\n"
4697    "      r Remove Redundant Gene xref\n", NULL, NULL, NULL,
4698     TRUE, 'F', ARG_STRING, 0.0, 0, NULL},
4699   {"Descriptor\n"
4700    "      t Remove Title\n", NULL, NULL, NULL,
4701     TRUE, 'D', ARG_STRING, 0.0, 0, NULL},
4702   {"Miscellaneous\n"
4703    "      d Automatic Definition Line\n", NULL, NULL, NULL,
4704     TRUE, 'X', ARG_STRING, 0.0, 0, NULL},
4705   {"Macro File", NULL, NULL, NULL,
4706     TRUE, 'M', ARG_FILE_IN, 0.0, 0, NULL},
4707   {"Taxonomy Lookup", "F", NULL, NULL,
4708     TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
4709   {"Publication Lookup", "F", NULL, NULL,
4710     TRUE, 'P', ARG_BOOLEAN, 0.0, 0, NULL},
4711 };
4712 
4713 Int2 Main (void)
4714 
4715 {
4716   ValNodePtr     action_list;
4717   AsnIoPtr       aip;
4718   Char           app [64], mode, type;
4719   CleanFlagData  cfd;
4720   CharPtr        directory, filter, infile, logfile, outfile,
4721                  macro_file, results, str, suffix;
4722   Boolean        remote;
4723   time_t         runtime, starttime, stoptime;
4724 
4725   /* standard setup */
4726 
4727   ErrSetFatalLevel (SEV_MAX);
4728   ErrClearOptFlags (EO_SHOW_USERSTR);
4729   UseLocalAsnloadDataAndErrMsg ();
4730   ErrPathReset ();
4731 
4732   /* finish resolving internal connections in ASN.1 parse tables */
4733 
4734   if (! AllObjLoad ()) {
4735     Message (MSG_FATAL, "AllObjLoad failed");
4736     return 1;
4737   }
4738   if (! SubmitAsnLoad ()) {
4739     Message (MSG_FATAL, "SubmitAsnLoad failed");
4740     return 1;
4741   }
4742   if (! FeatDefSetLoad ()) {
4743     Message (MSG_FATAL, "FeatDefSetLoad failed");
4744     return 1;
4745   }
4746   if (! SeqCodeSetLoad ()) {
4747     Message (MSG_FATAL, "SeqCodeSetLoad failed");
4748     return 1;
4749   }
4750   if (! GeneticCodeTableLoad ()) {
4751     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
4752     return 1;
4753   }
4754 
4755   /* process command line arguments */
4756 
4757   sprintf (app, "cleanasn %s", CLEANASN_APPLICATION);
4758   if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
4759     return 0;
4760   }
4761 
4762   MemSet ((Pointer) &cfd, 0, sizeof (CleanFlagData));
4763 
4764   directory = (CharPtr) myargs [p_argInputPath].strvalue;
4765   results = (CharPtr) myargs [r_argOutputPath].strvalue;
4766   if (StringHasNoText (results)) {
4767     results = directory;
4768   }
4769   infile = (CharPtr) myargs [i_argInputFile].strvalue;
4770   outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
4771   filter = (CharPtr) myargs [f_argFilter].strvalue;
4772   suffix = (CharPtr) myargs [x_argSuffix].strvalue;
4773 
4774   cfd.batch = FALSE;
4775   cfd.binary = (Boolean) myargs [b_argBinary].intvalue;
4776   cfd.compressed = (Boolean) myargs [c_argCompressed].intvalue;
4777   cfd.type = 1;
4778 
4779   cfd.foundfirst = FALSE;
4780   cfd.foundlast = FALSE;
4781   cfd.sourcedb = myargs [d_argSourceDb].strvalue;
4782 
4783   str = myargs [a_argType].strvalue;
4784   TrimSpacesAroundString (str);
4785   if (StringDoesHaveText (str)) {
4786     type = str [0];
4787   } else {
4788     type = 'a';
4789   }
4790 
4791   type = TO_LOWER (type);
4792   switch (type) {
4793     case 'a' :
4794       cfd.type = 1;
4795       break;
4796     case 'e' :
4797       cfd.type = 2;
4798       break;
4799     case 'b' :
4800       cfd.type = 3;
4801       break;
4802     case 's' :
4803       cfd.type = 4;
4804       break;
4805     case 'm' :
4806       cfd.type = 5;
4807       break;
4808     case 't' :
4809       cfd.type = 1;
4810       cfd.batch = TRUE;
4811       break;
4812     default :
4813       cfd.type = 1;
4814       break;
4815   }
4816 
4817   remote = (Boolean) myargs [R_argRemote].intvalue;
4818 
4819   cfd.report = myargs [Q_argReport].strvalue;
4820   cfd.selective = myargs [S_argSelective].strvalue;
4821   cfd.ffdiff = myargs [q_argFfDiff].strvalue;
4822   cfd.asn2flat = myargs [n_argAsn2Flat].strvalue;
4823   cfd.asnval = myargs [v_argAsnVal].strvalue;
4824 
4825   str = myargs [m_argFfMode].strvalue;
4826   TrimSpacesAroundString (str);
4827   if (StringDoesHaveText (str)) {
4828     mode = str [0];
4829   } else {
4830     mode = 'e';
4831   }
4832 
4833   mode = TO_LOWER (mode);
4834   switch (mode) {
4835     case 'r' :
4836       cfd.ffmode = RELEASE_MODE;
4837       break;
4838     case 'e' :
4839       cfd.ffmode = ENTREZ_MODE;
4840       break;
4841     case 's' :
4842       cfd.ffmode = SEQUIN_MODE;
4843       break;
4844     case 'd' :
4845       cfd.ffmode = DUMP_MODE;
4846       break;
4847     default :
4848       cfd.ffmode = ENTREZ_MODE;
4849       break;
4850   }
4851 
4852   cfd.clean = myargs [K_argClean].strvalue;
4853   cfd.modernize = myargs [U_argModernize].strvalue;
4854   cfd.link = myargs [N_argLink].strvalue;
4855   cfd.feat = myargs [F_argFeat].strvalue;
4856   cfd.desc = myargs [D_argDesc].strvalue;
4857   cfd.mods = myargs [X_argMods].strvalue;
4858   cfd.taxon = (Boolean) myargs [T_argTaxonLookup].intvalue;
4859   cfd.pub = (Boolean) myargs [P_argPubLookup].intvalue;
4860 
4861   macro_file = myargs [M_argMacro].strvalue;
4862   if (StringDoesHaveText (macro_file)) {
4863     aip = AsnIoOpen (macro_file, "r");
4864     if (aip == NULL) {
4865       Message (MSG_FATAL, "Unable to open macro file '%s'", macro_file);
4866       return 1;
4867     }
4868     action_list = MacroActionListAsnRead (aip, NULL);
4869     AsnIoClose (aip);
4870     if (action_list == NULL) {
4871       Message (MSG_FATAL, "Unable to read macro file '%s'", macro_file);
4872     }
4873     cfd.action_list = action_list;
4874   }
4875 
4876   cfd.amp = AsnAllModPtr ();
4877   cfd.atp_bss = AsnFind ("Bioseq-set");
4878   cfd.atp_bsss = AsnFind ("Bioseq-set.seq-set");
4879   cfd.atp_se = AsnFind ("Bioseq-set.seq-set.E");
4880   cfd.atp_bsc = AsnFind ("Bioseq-set.class");
4881   cfd.bssp_atp = AsnLinkType (NULL, cfd.atp_bss);
4882 
4883   logfile = (CharPtr) myargs [L_argLogFile].strvalue;
4884   if (StringDoesHaveText (logfile)) {
4885     cfd.logfp = FileOpen (logfile, "w");
4886   }
4887 
4888   if (remote) {
4889 #ifdef INTERNAL_NCBI_CLEANASN
4890     if (! PUBSEQBioseqFetchEnable ("cleanasn", FALSE)) {
4891       Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
4892       return 1;
4893     }
4894 #else
4895     PubSeqFetchEnable ();
4896 #endif
4897   }
4898 
4899   if (remote || cfd.pub) {
4900     PubMedFetchEnable ();
4901   }
4902 
4903   /*
4904   if (cfd.logfp != NULL && StringChr (cfd.report, 'c') != NULL) {
4905     fprintf (cfd.logfp, "FILE\t\tGENBANK\t\t\tEMBL\t\t\tDDBJ\t\t\tREFSEQ\t\t\tOTHER\n");
4906     fprintf (cfd.logfp, "\tREC\tNUC\tPRT\tREC\tNUC\tPRT\tREC\tNUC\tPRT\tREC\tNUC\tPRT\tREC\tNUC\tPRT\n");
4907     fflush (cfd.logfp);
4908   }
4909   */
4910 
4911   starttime = GetSecs ();
4912 
4913   if (StringDoesHaveText (directory)) {
4914     if (StringHasNoText (cfd.report) && StringCmp (directory, results) == 0) {
4915       Message (MSG_POSTERR, "-r results path must be different than -p data path");
4916       if (cfd.logfp != NULL) {
4917         fprintf (cfd.logfp, "-r results path must be different than -p data path\n");
4918       }
4919     } else {
4920 
4921       cfd.firstfile = (CharPtr) myargs [j_argFirstFile].strvalue;
4922       cfd.lastfile = (CharPtr) myargs [k_argLastFile].strvalue;
4923 
4924       cfd.results = results;
4925 
4926       DirExplore (directory, filter, suffix, FALSE, CleanupOneRecord, (Pointer) &cfd);
4927     }
4928 
4929   } else if (StringDoesHaveText (infile) && StringDoesHaveText (outfile)) {
4930 
4931     cfd.outfile = outfile;
4932 
4933     CleanupOneRecord (infile, (Pointer) &cfd);
4934   }
4935 
4936   stoptime = GetSecs ();
4937   runtime = stoptime - starttime;
4938 
4939   if (cfd.logfp != NULL) {
4940     if (StringChr (cfd.report, 'c') == NULL) {
4941       fprintf (cfd.logfp, "\nFinished in %ld seconds\n", (long) runtime);
4942       fprintf (cfd.logfp, "Cumulative counts ");
4943       fprintf (cfd.logfp, "- %9ld RECS", (long) cfd.cumcounts.recs);
4944       fprintf (cfd.logfp, ", %9ld NUCS", (long) cfd.cumcounts.nucs);
4945       fprintf (cfd.logfp, ", %9ld PRTS", (long) cfd.cumcounts.prts);
4946       fprintf (cfd.logfp, ", %9ld OKAY", (long) cfd.cumcounts.okay);
4947       fprintf (cfd.logfp, ", %9ld NORM", (long) cfd.cumcounts.norm);
4948       fprintf (cfd.logfp, ", %9ld CLNR", (long) cfd.cumcounts.clnr);
4949       fprintf (cfd.logfp, ", %9ld OTHR", (long) cfd.cumcounts.othr);
4950       fprintf (cfd.logfp, ", %9ld MODR", (long) cfd.cumcounts.modr);
4951       fprintf (cfd.logfp, ", %9ld SLOC", (long) cfd.cumcounts.sloc);
4952       fprintf (cfd.logfp, ", %9ld PUBL", (long) cfd.cumcounts.publ);
4953       fprintf (cfd.logfp, ", %9ld AUTH", (long) cfd.cumcounts.auth);
4954       fprintf (cfd.logfp, ", %9ld SORT", (long) cfd.cumcounts.sort);
4955       fprintf (cfd.logfp, ", %9ld BSEC", (long) cfd.cumcounts.bsec);
4956       fprintf (cfd.logfp, ", %9ld GBBK", (long) cfd.cumcounts.gbbk);
4957       fprintf (cfd.logfp, ", %9ld TITL", (long) cfd.cumcounts.titl);
4958       fprintf (cfd.logfp, ", %9ld PACK", (long) cfd.cumcounts.pack);
4959       fprintf (cfd.logfp, ", %9ld MOVE", (long) cfd.cumcounts.move);
4960       fprintf (cfd.logfp, ", %9ld SSEC", (long) cfd.cumcounts.ssec);
4961       fprintf (cfd.logfp, "\n");
4962       fflush (cfd.logfp);
4963     }
4964     FileClose (cfd.logfp);
4965   }
4966 
4967   if (remote || cfd.pub) {
4968     PubMedFetchDisable ();
4969   }
4970 
4971   if (remote) {
4972 #ifdef INTERNAL_NCBI_CLEANASN
4973     PUBSEQBioseqFetchDisable ();
4974 #else
4975     PubSeqFetchDisable ();
4976 #endif
4977   }
4978 
4979   return 0;
4980 }
4981 
4982 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.