|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/demo/asnval.c |
source navigation diff markup identifier search freetext search file search |
1 /* asnval.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: asnval.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 11/3/04
31 *
32 * $Revision: 1.102 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <objfdef.h>
50 #include <seqport.h>
51 #include <sequtil.h>
52 #include <sqnutils.h>
53 #include <subutil.h>
54 #include <gather.h>
55 #include <explore.h>
56 #include <lsqfetch.h>
57 #include <valid.h>
58 #include <pmfapi.h>
59 #ifdef INTERNAL_NCBI_ASN2VAL
60 #include <accpubseq.h>
61 #endif
62
63 #define ASNVAL_APP_VER "7.5"
64
65 CharPtr ASNVAL_APPLICATION = ASNVAL_APP_VER;
66
67 typedef struct valflags {
68 Int2 severity;
69 Int2 lowCutoff;
70 Int2 highCutoff;
71 CharPtr errcode;
72 Boolean validateAlignments;
73 Boolean alignFindRemoteBsp;
74 Boolean doSeqHistAssembly;
75 Boolean farIDsInAlignments;
76 Boolean alwaysRequireIsoJTA;
77 Boolean farFetchCDSproducts;
78 Boolean farFetchMRNAproducts;
79 Boolean locusTagGeneralMatch;
80 Boolean validateIDSet;
81 Boolean seqSubmitParent;
82 Boolean ignoreExceptions;
83 Boolean validateExons;
84 Boolean inferenceAccnCheck;
85 Boolean testLatLonSubregion;
86 Boolean strictLatLonCountry;
87 Boolean indexerVersion;
88 Boolean automatic;
89 Boolean batch;
90 Boolean binary;
91 Boolean compressed;
92 Boolean lock;
93 Boolean useThreads;
94 Boolean usePUBSEQ;
95 Boolean validateBarcode;
96 Int2 verbosity;
97 Int2 type;
98 Int4 skipcount;
99 Int4 maxcount;
100 CharPtr outpath;
101 FILE *outfp;
102 FILE *logfp;
103 Int4 num_errors;
104 Int4 fatal_errors;
105 Boolean has_errors;
106 Boolean io_failure;
107 Char longest [64];
108 time_t worsttime;
109 Int4 numrecords;
110 Char path [PATH_MAX];
111 } ValFlagData, PNTR ValFlagPtr;
112
113 #ifdef INTERNAL_NCBI_ASN2VAL
114 static CharPtr dirsubfetchproc = "DirSubBioseqFetch";
115
116 static CharPtr dirsubfetchcmd = NULL;
117
118 extern Pointer ReadFromDirSub (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
119 extern Pointer ReadFromDirSub (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
120
121 {
122 Char cmmd [256];
123 Pointer dataptr;
124 FILE* fp;
125 Char path [PATH_MAX];
126
127 if (datatype != NULL) {
128 *datatype = 0;
129 }
130 if (entityID != NULL) {
131 *entityID = 0;
132 }
133 if (StringHasNoText (accn)) return NULL;
134
135 if (dirsubfetchcmd == NULL) {
136 if (GetAppParam ("SEQUIN", "DIRSUB", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
137 dirsubfetchcmd = StringSaveNoNull (cmmd);
138 }
139 }
140 if (dirsubfetchcmd == NULL) return NULL;
141
142 TmpNam (path);
143
144 #ifdef OS_UNIX
145 sprintf (cmmd, "csh %s %s > %s", dirsubfetchcmd, accn, path);
146 system (cmmd);
147 #endif
148 #ifdef OS_MSWIN
149 sprintf (cmmd, "%s %s -o %s", dirsubfetchcmd, accn, path);
150 system (cmmd);
151 #endif
152
153 fp = FileOpen (path, "r");
154 if (fp == NULL) {
155 FileRemove (path);
156 return NULL;
157 }
158 dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
159 FileClose (fp);
160 FileRemove (path);
161 return dataptr;
162 }
163
164
165 static Int2 LIBCALLBACK DirSubBioseqFetchFunc (Pointer data)
166
167 {
168 BioseqPtr bsp;
169 Char cmmd [256];
170 Pointer dataptr;
171 Uint2 datatype;
172 Uint2 entityID;
173 FILE* fp;
174 OMProcControlPtr ompcp;
175 ObjMgrProcPtr ompp;
176 Char path [PATH_MAX];
177 SeqEntryPtr sep = NULL;
178 SeqIdPtr sip;
179 TextSeqIdPtr tsip;
180
181 ompcp = (OMProcControlPtr) data;
182 if (ompcp == NULL) return OM_MSG_RET_ERROR;
183 ompp = ompcp->proc;
184 if (ompp == NULL) return OM_MSG_RET_ERROR;
185 sip = (SeqIdPtr) ompcp->input_data;
186 if (sip == NULL) return OM_MSG_RET_ERROR;
187
188 if (sip->choice != SEQID_GENBANK) return OM_MSG_RET_ERROR;
189 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
190 if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
191
192 if (dirsubfetchcmd == NULL) {
193 if (GetAppParam ("SEQUIN", "DIRSUB", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
194 dirsubfetchcmd = StringSaveNoNull (cmmd);
195 }
196 }
197 if (dirsubfetchcmd == NULL) return OM_MSG_RET_ERROR;
198
199 TmpNam (path);
200
201 #ifdef OS_UNIX
202 sprintf (cmmd, "csh %s %s > %s", dirsubfetchcmd, tsip->accession, path);
203 system (cmmd);
204 #endif
205 #ifdef OS_MSWIN
206 sprintf (cmmd, "%s %s -o %s", dirsubfetchcmd, tsip->accession, path);
207 system (cmmd);
208 #endif
209
210 fp = FileOpen (path, "r");
211 if (fp == NULL) {
212 FileRemove (path);
213 return OM_MSG_RET_ERROR;
214 }
215 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
216 FileClose (fp);
217 FileRemove (path);
218
219 if (dataptr == NULL) return OM_MSG_RET_OK;
220
221 sep = GetTopSeqEntryForEntityID (entityID);
222 if (sep == NULL) return OM_MSG_RET_ERROR;
223 bsp = BioseqFindInSeqEntry (sip, sep);
224 ompcp->output_data = (Pointer) bsp;
225 ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
226 return OM_MSG_RET_DONE;
227 }
228
229 static Boolean DirSubFetchEnable (void)
230
231 {
232 ObjMgrProcLoad (OMPROC_FETCH, dirsubfetchproc, dirsubfetchproc,
233 OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
234 DirSubBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
235 return TRUE;
236 }
237
238 static CharPtr smartfetchproc = "SmartBioseqFetch";
239
240 static CharPtr smartfetchcmd = NULL;
241
242 extern Pointer ReadFromSmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
243 extern Pointer ReadFromSmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
244
245 {
246 Char cmmd [256];
247 Pointer dataptr;
248 FILE* fp;
249 Char path [PATH_MAX];
250
251 if (datatype != NULL) {
252 *datatype = 0;
253 }
254 if (entityID != NULL) {
255 *entityID = 0;
256 }
257 if (StringHasNoText (accn)) return NULL;
258
259 if (smartfetchcmd == NULL) {
260 if (GetAppParam ("SEQUIN", "SMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
261 smartfetchcmd = StringSaveNoNull (cmmd);
262 }
263 }
264 if (smartfetchcmd == NULL) return NULL;
265
266 TmpNam (path);
267
268 #ifdef OS_UNIX
269 sprintf (cmmd, "csh %s %s > %s", smartfetchcmd, accn, path);
270 system (cmmd);
271 #endif
272 #ifdef OS_MSWIN
273 sprintf (cmmd, "%s %s -o %s", smartfetchcmd, accn, path);
274 system (cmmd);
275 #endif
276
277 fp = FileOpen (path, "r");
278 if (fp == NULL) {
279 FileRemove (path);
280 return NULL;
281 }
282 dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
283 FileClose (fp);
284 FileRemove (path);
285 return dataptr;
286 }
287
288
289 static Int2 LIBCALLBACK SmartBioseqFetchFunc (Pointer data)
290
291 {
292 BioseqPtr bsp;
293 Char cmmd [256];
294 Pointer dataptr;
295 Uint2 datatype;
296 Uint2 entityID;
297 FILE* fp;
298 OMProcControlPtr ompcp;
299 ObjMgrProcPtr ompp;
300 Char path [PATH_MAX];
301 SeqEntryPtr sep = NULL;
302 SeqIdPtr sip;
303 TextSeqIdPtr tsip;
304
305 ompcp = (OMProcControlPtr) data;
306 if (ompcp == NULL) return OM_MSG_RET_ERROR;
307 ompp = ompcp->proc;
308 if (ompp == NULL) return OM_MSG_RET_ERROR;
309 sip = (SeqIdPtr) ompcp->input_data;
310 if (sip == NULL) return OM_MSG_RET_ERROR;
311
312 if (sip->choice != SEQID_GENBANK) return OM_MSG_RET_ERROR;
313 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
314 if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
315
316 if (smartfetchcmd == NULL) {
317 if (GetAppParam ("SEQUIN", "SMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
318 smartfetchcmd = StringSaveNoNull (cmmd);
319 }
320 }
321 if (smartfetchcmd == NULL) return OM_MSG_RET_ERROR;
322
323 TmpNam (path);
324
325 #ifdef OS_UNIX
326 sprintf (cmmd, "csh %s %s > %s", smartfetchcmd, tsip->accession, path);
327 system (cmmd);
328 #endif
329 #ifdef OS_MSWIN
330 sprintf (cmmd, "%s %s -o %s", smartfetchcmd, tsip->accession, path);
331 system (cmmd);
332 #endif
333
334 fp = FileOpen (path, "r");
335 if (fp == NULL) {
336 FileRemove (path);
337 return OM_MSG_RET_ERROR;
338 }
339 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
340 FileClose (fp);
341 FileRemove (path);
342
343 if (dataptr == NULL) return OM_MSG_RET_OK;
344
345 sep = GetTopSeqEntryForEntityID (entityID);
346 if (sep == NULL) return OM_MSG_RET_ERROR;
347 bsp = BioseqFindInSeqEntry (sip, sep);
348 ompcp->output_data = (Pointer) bsp;
349 ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
350 return OM_MSG_RET_DONE;
351 }
352
353 static Boolean SmartFetchEnable (void)
354
355 {
356 ObjMgrProcLoad (OMPROC_FETCH, smartfetchproc, smartfetchproc,
357 OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
358 SmartBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
359 return TRUE;
360 }
361
362 static CharPtr tpasmartfetchproc = "TPASmartBioseqFetch";
363
364 static CharPtr tpasmartfetchcmd = NULL;
365
366 extern Pointer ReadFromTPASmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
367 extern Pointer ReadFromTPASmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
368
369 {
370 Char cmmd [256];
371 Pointer dataptr;
372 FILE* fp;
373 Char path [PATH_MAX];
374
375 if (datatype != NULL) {
376 *datatype = 0;
377 }
378 if (entityID != NULL) {
379 *entityID = 0;
380 }
381 if (StringHasNoText (accn)) return NULL;
382
383 if (tpasmartfetchcmd == NULL) {
384 if (GetAppParam ("SEQUIN", "TPASMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
385 tpasmartfetchcmd = StringSaveNoNull (cmmd);
386 }
387 }
388 if (tpasmartfetchcmd == NULL) return NULL;
389
390 TmpNam (path);
391
392 #ifdef OS_UNIX
393 sprintf (cmmd, "csh %s %s > %s", tpasmartfetchcmd, accn, path);
394 system (cmmd);
395 #endif
396 #ifdef OS_MSWIN
397 sprintf (cmmd, "%s %s -o %s", tpasmartfetchcmd, accn, path);
398 system (cmmd);
399 #endif
400
401 fp = FileOpen (path, "r");
402 if (fp == NULL) {
403 FileRemove (path);
404 return NULL;
405 }
406 dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
407 FileClose (fp);
408 FileRemove (path);
409 return dataptr;
410 }
411
412
413 static Int2 LIBCALLBACK TPASmartBioseqFetchFunc (Pointer data)
414
415 {
416 BioseqPtr bsp;
417 Char cmmd [256];
418 Pointer dataptr;
419 Uint2 datatype;
420 Uint2 entityID;
421 FILE* fp;
422 OMProcControlPtr ompcp;
423 ObjMgrProcPtr ompp;
424 Char path [PATH_MAX];
425 SeqEntryPtr sep = NULL;
426 SeqIdPtr sip;
427 TextSeqIdPtr tsip;
428
429 ompcp = (OMProcControlPtr) data;
430 if (ompcp == NULL) return OM_MSG_RET_ERROR;
431 ompp = ompcp->proc;
432 if (ompp == NULL) return OM_MSG_RET_ERROR;
433 sip = (SeqIdPtr) ompcp->input_data;
434 if (sip == NULL) return OM_MSG_RET_ERROR;
435
436 if (sip->choice != SEQID_TPG) return OM_MSG_RET_ERROR;
437 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
438 if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
439
440 if (tpasmartfetchcmd == NULL) {
441 if (GetAppParam ("SEQUIN", "TPASMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
442 tpasmartfetchcmd = StringSaveNoNull (cmmd);
443 }
444 }
445 if (tpasmartfetchcmd == NULL) return OM_MSG_RET_ERROR;
446
447 TmpNam (path);
448
449 #ifdef OS_UNIX
450 sprintf (cmmd, "csh %s %s > %s", tpasmartfetchcmd, tsip->accession, path);
451 system (cmmd);
452 #endif
453 #ifdef OS_MSWIN
454 sprintf (cmmd, "%s %s -o %s", tpasmartfetchcmd, tsip->accession, path);
455 system (cmmd);
456 #endif
457
458 fp = FileOpen (path, "r");
459 if (fp == NULL) {
460 FileRemove (path);
461 return OM_MSG_RET_ERROR;
462 }
463 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
464 FileClose (fp);
465 FileRemove (path);
466
467 if (dataptr == NULL) return OM_MSG_RET_OK;
468
469 sep = GetTopSeqEntryForEntityID (entityID);
470 if (sep == NULL) return OM_MSG_RET_ERROR;
471 bsp = BioseqFindInSeqEntry (sip, sep);
472 ompcp->output_data = (Pointer) bsp;
473 ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
474 return OM_MSG_RET_DONE;
475 }
476
477 static Boolean TPASmartFetchEnable (void)
478
479 {
480 ObjMgrProcLoad (OMPROC_FETCH, tpasmartfetchproc, tpasmartfetchproc,
481 OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
482 TPASmartBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
483 return TRUE;
484 }
485 #endif
486
487 static ValNodePtr DoLockFarComponents (
488 SeqEntryPtr sep,
489 ValFlagPtr vfp
490 )
491
492 {
493 Boolean farFetch;
494 ValNodePtr rsult;
495 time_t start_time, stop_time;
496
497 start_time = GetSecs ();
498
499 #ifdef INTERNAL_NCBI_ASN2VAL
500 if (vfp->useThreads) {
501 Message (MSG_POST, "Threads will not be used in this executable");
502 vfp->useThreads = FALSE;;
503 }
504 #endif
505
506 farFetch = (Boolean) (vfp->farFetchCDSproducts);
507
508 if (NlmThreadsAvailable () && vfp->useThreads) {
509 rsult = AdvcLockFarComponents (sep, TRUE, farFetch, farFetch, NULL, TRUE);
510 } else if (vfp->useThreads) {
511 Message (MSG_POST, "Threads not available in this executable");
512 rsult = AdvcLockFarComponents (sep, TRUE, farFetch, farFetch, NULL, FALSE);
513 } else {
514 rsult = AdvcLockFarComponents (sep, TRUE, farFetch, farFetch, NULL, FALSE);
515 }
516
517 stop_time = GetSecs ();
518
519 return rsult;
520 }
521
522 static CharPtr severityLabel [] = {
523 "NONE", "INFO", "WARNING", "ERROR", "REJECT", "FATAL", "MAX", NULL
524 };
525
526 static CharPtr compatSeverityLabel [] = {
527 "NONE", "NOTE: valid", "WARNING: valid", "ERROR: valid", "REJECT: valid", "FATAL: valid", "MAX", NULL
528 };
529
530 typedef struct vcdaa {
531 FILE *ofp;
532 Int2 verbosity;
533 Int2 lowCutoff;
534 Int2 highCutoff;
535 CharPtr errcode;
536 ValFlagPtr vfp;
537 } VCData, PNTR VCPtr;
538
/*
 * XmlEncode
 *
 * Copies src to dst, writing "&lt;" for '<' and "&gt;" for '>'; every other
 * character is copied unchanged.  dst is NUL-terminated.  Nothing is
 * written when either pointer is NULL.
 *
 * The caller supplies a dst large enough for the expanded text (up to four
 * output characters per input character, plus the terminator).
 */
static void XmlEncode (CharPtr dst, CharPtr src)

{
  CharPtr  entity;
  CharPtr  in;
  CharPtr  out;

  if (dst == NULL || src == NULL) return;

  out = dst;
  for (in = src; *in != '\0'; in++) {
    switch (*in) {
      case '<' :
        entity = "&lt;";
        break;
      case '>' :
        entity = "&gt;";
        break;
      default :
        entity = NULL;
        break;
    }
    if (entity == NULL) {
      *out = *in;
      out++;
    } else {
      while (*entity != '\0') {
        *out = *entity;
        out++;
        entity++;
      }
    }
  }
  *out = '\0';
}
575
576
577 static CharPtr GetXmlHeaderText (ErrSev cutoff)
578 {
579 CharPtr xml_header = NULL;
580 CharPtr xml_4_fmt = "asnval version=\"%s\" severity_cutoff=\"%s\"";
581
582 xml_header = (CharPtr) MemNew (sizeof (Char) * (10 + StringLen (xml_4_fmt) +
583 StringLen (ASNVAL_APPLICATION) + StringLen (severityLabel[cutoff])));
584 sprintf (xml_header, xml_4_fmt, ASNVAL_APPLICATION, severityLabel[cutoff]);
585 return xml_header;
586 }
587
588
589 static void LIBCALLBACK ValidCallback (
590 ErrSev severity,
591 int errcode,
592 int subcode,
593 Uint2 entityID,
594 Uint2 itemtype,
595 Uint4 itemID,
596 CharPtr accession,
597 CharPtr featureID,
598 CharPtr message,
599 CharPtr objtype,
600 CharPtr label,
601 CharPtr context,
602 CharPtr location,
603 CharPtr product,
604 Pointer userdata
605 )
606
607 {
608 Char buf [256];
609 CharPtr catname, errname, urlmssg = NULL;
610 ErrSev cutoff;
611 FILE *fp;
612 size_t len;
613 VCPtr vcp;
614 ValFlagPtr vfp;
615 CharPtr xml_header;
616
617 vcp = (VCPtr) userdata;
618 if (vcp == NULL) return;
619 fp = vcp->ofp;
620 if (fp == NULL) return;
621 vfp = vcp->vfp;
622 if (vfp == NULL) return;
623
624 if (severity < SEV_NONE || severity > SEV_MAX) {
625 severity = SEV_MAX;
626 }
627
628 if (severity < vcp->lowCutoff || severity > vcp->highCutoff) return;
629
630 catname = GetValidCategoryName (errcode);
631 errname = GetValidErrorName (errcode, subcode);
632
633 if (catname == NULL) {
634 catname = "?";
635 }
636 if (errname == NULL) {
637 errname = "?";
638 }
639
640 if (StringDoesHaveText (vcp->errcode)) {
641 if (StringICmp (vcp->errcode, errname) != 0) return;
642 }
643
644 if (accession == NULL) {
645 accession = "";
646 }
647 if (featureID == NULL) {
648 featureID = "";
649 }
650 if (message == NULL) {
651 message = "";
652 }
653 if (objtype == NULL) {
654 objtype = "";
655 }
656 if (label == NULL) {
657 label = "";
658 }
659
660 if (vcp->verbosity == 1) {
661
662 fprintf (fp, "%s [%s.%s] %s %s: %s",
663 compatSeverityLabel [severity],
664 catname, errname, message, objtype, label);
665 if (StringDoesHaveText (featureID)) {
666 fprintf (fp, " <%s>", featureID);
667 }
668 if (location != NULL) {
669 fprintf (fp, " %s", location);
670 }
671 if (context != NULL) {
672 fprintf (fp, " %s", context);
673 }
674 if (product != NULL) {
675 fprintf (fp, " -> %s", product);
676 }
677 fprintf (fp, "\n");
678
679 } else if (vcp->verbosity == 2) {
680
681 StringCpy (buf, accession);
682 StringCat (buf, " ");
683 buf [15] = '\0';
684
685 StringCat (buf, severityLabel [severity]);
686 StringCat (buf, " ");
687 buf [30] = '\0';
688
689 StringCat (buf, catname);
690 StringCat (buf, "_");
691 StringCat (buf, errname);
692
693 fprintf (fp, "%s\n", buf);
694
695 } else if (vcp->verbosity == 3) {
696
697 fprintf (fp, "%s\t%s\t%s_%s\n",
698 accession, severityLabel [severity],
699 catname, errname);
700
701 } else if (vcp->verbosity == 4) {
702
703 if (! vfp->has_errors) {
704 cutoff = (ErrSev) vcp->lowCutoff;
705 if (cutoff < SEV_NONE || cutoff > SEV_MAX) {
706 cutoff = SEV_MAX;
707 }
708
709 xml_header = GetXmlHeaderText (cutoff);
710 fprintf (fp, "<%s>\n", xml_header);
711 xml_header = MemFree (xml_header);
712 }
713
714 len = StringLen (message);
715 if (len > 0) {
716 urlmssg = MemNew (len * 3 + 2);
717 if (urlmssg != NULL) {
718 XmlEncode (urlmssg, message);
719 if (StringDoesHaveText (featureID)) {
720 fprintf (fp, " <message severity=\"%s\" seq-id=\"%s\" feat-id=\"%s\" code=\"%s_%s\">%s</message>\n",
721 severityLabel [severity], accession, featureID, catname, errname, urlmssg);
722 } else {
723 fprintf (fp, " <message severity=\"%s\" seq-id=\"%s\" code=\"%s_%s\">%s</message>\n",
724 severityLabel [severity], accession, catname, errname, urlmssg);
725 }
726 MemFree (urlmssg);
727 }
728 }
729 }
730
731 vfp->has_errors = TRUE;
732 }
733
734 static void DoValidation (
735 SeqEntryPtr sep,
736 ValFlagPtr vfp,
737 FILE *ofp
738 )
739
740 {
741 Int2 i;
742 VCData vcd;
743 ValidStructPtr vsp;
744 ErrSev cutoff;
745 CharPtr xml_header = NULL;
746
747 if (vfp == NULL) return;
748
749 vsp = ValidStructNew ();
750 if (vsp == NULL) return;
751
752 MemSet ((Pointer) &vcd, 0, sizeof (VCData));
753
754 vsp->useSeqMgrIndexes = TRUE;
755
756 vsp->cutoff = vfp->lowCutoff;
757 vsp->validateAlignments = vfp->validateAlignments;
758 vsp->alignFindRemoteBsp = vfp->alignFindRemoteBsp;
759 vsp->doSeqHistAssembly = vfp->doSeqHistAssembly;
760 vsp->farIDsInAlignments = vfp->farIDsInAlignments;
761 vsp->alwaysRequireIsoJTA = vfp->alwaysRequireIsoJTA;
762 vsp->farFetchCDSproducts = vfp->farFetchCDSproducts;
763 vsp->farFetchMRNAproducts = vfp->farFetchMRNAproducts;
764 vsp->locusTagGeneralMatch = vfp->locusTagGeneralMatch;
765 vsp->validateIDSet = vfp->validateIDSet;
766 vsp->seqSubmitParent = vfp->seqSubmitParent;
767 vsp->ignoreExceptions = vfp->ignoreExceptions;
768 vsp->validateExons = vfp->validateExons;
769 vsp->inferenceAccnCheck = vfp->inferenceAccnCheck;
770 vsp->testLatLonSubregion = vfp->testLatLonSubregion;
771 vsp->strictLatLonCountry = vfp->strictLatLonCountry;
772 vsp->indexerVersion = vfp->indexerVersion;
773
774 if (ofp == NULL && vfp->outfp != NULL) {
775 ofp = vfp->outfp;
776 }
777 if (ofp != NULL) {
778 vcd.ofp = ofp;
779 vcd.verbosity = vfp->verbosity;
780 vcd.lowCutoff = vfp->lowCutoff;
781 vcd.highCutoff = vfp->highCutoff;
782 vcd.errcode = vfp->errcode;
783 vcd.vfp = vfp;
784 vsp->errfunc = ValidCallback;
785 vsp->userdata = (Pointer) &vcd;
786 vsp->convertGiToAccn = FALSE;
787 }
788
789 ValidateSeqEntry (sep, vsp);
790
791 for (i = 0; i <= 4; i++) {
792 vfp->num_errors += vsp->errors [i];
793 if (i >= vfp->severity) {
794 vfp->fatal_errors += vsp->errors [i];
795 }
796 }
797
798 ValidStructFree (vsp);
799 if (vfp->validateBarcode) {
800 if (vfp->verbosity == 4 && !vfp->has_errors) {
801 cutoff = (ErrSev) vfp->lowCutoff;
802 if (cutoff < SEV_NONE || cutoff > SEV_MAX) {
803 cutoff = SEV_MAX;
804 }
805 xml_header = GetXmlHeaderText(cutoff);
806 }
807 if (!BarcodeValidateOneSeqEntry (ofp, sep, TRUE,
808 vfp->verbosity == 4,
809 !vfp->has_errors,
810 xml_header)) {
811 vfp->has_errors = TRUE;
812 }
813 xml_header = MemFree (xml_header);
814 }
815 }
816
817 static void ProcessSingleRecord (
818 CharPtr filename,
819 ValFlagPtr vfp
820 )
821
822 {
823 AsnIoPtr aip;
824 BioseqPtr bsp;
825 ValNodePtr bsplist;
826 BioseqSetPtr bssp;
827 Char buf [64], path [PATH_MAX];
828 Pointer dataptr = NULL;
829 Uint2 datatype = 0, entityID = 0;
830 FILE *fp, *ofp = NULL;
831 SeqEntryPtr fsep, sep;
832 ObjMgrPtr omp;
833 CharPtr ptr;
834 time_t starttime, stoptime;
835
836 if (StringHasNoText (filename)) return;
837 if (vfp == NULL) return;
838
839 if (vfp->type == 1) {
840 fp = FileOpen (filename, "r");
841 if (fp == NULL) {
842 Message (MSG_POSTERR, "Failed to open '%s'", filename);
843 return;
844 }
845
846 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE);
847
848 FileClose (fp);
849
850 entityID = ObjMgrRegister (datatype, dataptr);
851
852 } else if (vfp->type >= 2 && vfp->type <= 5) {
853 aip = AsnIoOpen (filename, vfp->binary? "rb" : "r");
854 if (aip == NULL) {
855 Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", filename);
856 return;
857 }
858
859 SeqMgrHoldIndexing (TRUE);
860 switch (vfp->type) {
861 case 2 :
862 dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
863 datatype = OBJ_SEQENTRY;
864 break;
865 case 3 :
866 dataptr = (Pointer) BioseqAsnRead (aip, NULL);
867 datatype = OBJ_BIOSEQ;
868 break;
869 case 4 :
870 dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
871 datatype = OBJ_BIOSEQSET;
872 break;
873 case 5 :
874 dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
875 datatype = OBJ_SEQSUB;
876 break;
877 default :
878 break;
879 }
880 SeqMgrHoldIndexing (FALSE);
881
882 AsnIoClose (aip);
883
884 entityID = ObjMgrRegister (datatype, dataptr);
885
886 } else {
887 Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) vfp->type);
888 return;
889 }
890
891 if (entityID < 1 || dataptr == NULL) {
892 Message (MSG_POSTERR, "Data read failed for input file '%s'", filename);
893 return;
894 }
895
896 if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
897 datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {
898
899 sep = GetTopSeqEntryForEntityID (entityID);
900
901 if (sep == NULL) {
902 sep = SeqEntryNew ();
903 if (sep != NULL) {
904 if (datatype == OBJ_BIOSEQ) {
905 bsp = (BioseqPtr) dataptr;
906 sep->choice = 1;
907 sep->data.ptrvalue = bsp;
908 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
909 } else if (datatype == OBJ_BIOSEQSET) {
910 bssp = (BioseqSetPtr) dataptr;
911 sep->choice = 2;
912 sep->data.ptrvalue = bssp;
913 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
914 } else {
915 sep = SeqEntryFree (sep);
916 }
917 }
918 sep = GetTopSeqEntryForEntityID (entityID);
919 }
920
921 if (sep != NULL) {
922
923 starttime = GetSecs ();
924 buf [0] = '\0';
925
926 if (vfp->logfp != NULL) {
927 fsep = FindNthBioseq (sep, 1);
928 if (fsep != NULL && fsep->choice == 1) {
929 bsp = (BioseqPtr) fsep->data.ptrvalue;
930 if (bsp != NULL) {
931 SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
932 fprintf (vfp->logfp, "%s\n", buf);
933 fflush (vfp->logfp);
934 }
935 }
936 }
937
938 StringNCpy_0 (path, filename, sizeof (path));
939 ptr = StringRChr (path, '.');
940 if (ptr != NULL) {
941 *ptr = '\0';
942 }
943 StringCat (path, ".val");
944
945 if (vfp->outpath != NULL) {
946 ErrSetLogfile (vfp->outpath, ELOG_APPEND);
947 } else if (vfp->verbosity == 0) {
948 ErrSetLogfile (path, ELOG_APPEND);
949 } else if (vfp->outfp == NULL) {
950 ofp = FileOpen (path, "w");
951 }
952
953 bsplist = NULL;
954
955 if (vfp->inferenceAccnCheck) {
956 LookupFarSeqIDs (sep, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE);
957 }
958 if (vfp->lock) {
959 bsplist = DoLockFarComponents (sep, vfp);
960 }
961
962 DoValidation (sep, vfp, ofp);
963
964 bsplist = UnlockFarComponents (bsplist);
965
966 if (ofp != NULL) {
967 if (vfp->has_errors) {
968 if (vfp->verbosity == 4) {
969 fprintf (ofp, "</asnval>\n");
970 }
971 vfp->has_errors = FALSE;
972 }
973 FileClose (ofp);
974 }
975
976 stoptime = GetSecs ();
977 if (stoptime - starttime > vfp->worsttime && StringDoesHaveText (buf)) {
978 vfp->worsttime = stoptime - starttime;
979 StringCpy (vfp->longest, buf);
980 }
981 (vfp->numrecords)++;
982 }
983 } else {
984 Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
985 }
986
987 ObjMgrFree (datatype, dataptr);
988
989 omp = ObjMgrGet ();
990 ObjMgrReapOne (omp);
991 SeqMgrClearBioseqIndex ();
992 ObjMgrFreeCache (0);
993 FreeSeqIdGiCache ();
994
995 SeqEntrySetScope (NULL);
996 }
997
998 static void ProcessMultipleRecord (
999 CharPtr filename,
1000 ValFlagPtr vfp
1001 )
1002
1003 {
1004 AsnIoPtr aip;
1005 AsnModulePtr amp;
1006 AsnTypePtr atp, atp_bss, atp_desc, atp_sbp, atp_se = NULL, atp_ssp;
1007 BioseqPtr bsp;
1008 ValNodePtr bsplist;
1009 BioseqSetPtr bssp;
1010 Char buf [64], path [PATH_MAX], longest [64];
1011 Int2 skipcount = 0, maxcount = 0;
1012 CitSubPtr csp = NULL;
1013 FILE *fp, *ofp = NULL;
1014 Int4 numrecords = 0;
1015 SeqEntryPtr fsep, sep;
1016 ObjMgrPtr omp;
1017 ObjValNode ovn;
1018 Pubdesc pd;
1019 CharPtr ptr;
1020 SubmitBlockPtr sbp = NULL;
1021 time_t starttime, stoptime, worsttime;
1022 SeqDescrPtr subcit = NULL;
1023 ValNode vn;
1024 #ifdef OS_UNIX
1025 Char cmmd [256];
1026 Boolean detailed_report = FALSE;
1027 CharPtr gzcatprog;
1028 Boolean memory_usage = FALSE;
1029 int ret;
1030 Boolean usedPopen = FALSE;
1031 #endif
1032
1033 if (StringHasNoText (filename)) return;
1034 if (vfp == NULL) return;
1035
1036 #ifndef OS_UNIX
1037 if (vfp->compressed) {
1038 Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
1039 return;
1040 }
1041 #endif
1042
1043 amp = AsnAllModPtr ();
1044 if (amp == NULL) {
1045 Message (MSG_POSTERR, "Unable to load AsnAllModPtr");
1046 return;
1047 }
1048
1049 atp_ssp = AsnFind ("Seq-submit");
1050 if (atp_ssp == NULL) {
1051 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
1052 return;
1053 }
1054
1055 atp_sbp = AsnFind ("Seq-submit.sub");
1056 if (atp_sbp == NULL) {
1057 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.sub");
1058 return;
1059 }
1060
1061 atp_bss = AsnFind ("Bioseq-set");
1062 if (atp_bss == NULL) {
1063 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set");
1064 return;
1065 }
1066
1067 atp_desc = AsnFind ("Bioseq-set.descr");
1068 if (atp_desc == NULL) {
1069 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.descr");
1070 return;
1071 }
1072
1073 if (vfp->type == 4) {
1074 atp_se = AsnFind ("Bioseq-set.seq-set.E");
1075 if (atp_se == NULL) {
1076 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
1077 return;
1078 }
1079 } else if (vfp->type == 5) {
1080 atp_se = AsnFind ("Seq-submit.data.entrys.E");
1081 if (atp_se == NULL) {
1082 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E");
1083 return;
1084 }
1085 } else {
1086 Message (MSG_POSTERR, "Batch processing type not set properly");
1087 return;
1088 }
1089
1090 if (atp_se == NULL) {
1091 Message (MSG_POSTERR, "Unable to find ASN.1 type for atp_se");
1092 return;
1093 }
1094
1095 #ifdef OS_UNIX
1096 if (getenv ("ASNVAL_LOG_OBJMGR_REPORT") != NULL) {
1097 detailed_report = TRUE;
1098 }
1099 if (getenv ("ASNVAL_LOG_MEMORY_REPORT") != NULL) {
1100 memory_usage = TRUE;
1101 }
1102
1103 if (vfp->compressed) {
1104 gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
1105 if (gzcatprog != NULL) {
1106 sprintf (cmmd, "%s %s", gzcatprog, filename);
1107 } else {
1108 ret = system ("gzcat -h >/dev/null 2>&1");
1109 if (ret == 0) {
1110 sprintf (cmmd, "gzcat %s", filename);
1111 } else if (ret == -1) {
1112 Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
1113 return;
1114 } else {
1115 ret = system ("zcat -h >/dev/null 2>&1");
1116 if (ret == 0) {
1117 sprintf (cmmd, "zcat %s", filename);
1118 } else if (ret == -1) {
1119 Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
1120 return;
1121 } else {
1122 Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
1123 return;
1124 }
1125 }
1126 }
1127 fp = popen (cmmd, /* vfp->binary? "rb" : */ "r");
1128 usedPopen = TRUE;
1129 } else {
1130 fp = FileOpen (filename, vfp->binary? "rb" : "r");
1131 }
1132 #else
1133 fp = FileOpen (filename, vfp->binary? "rb" : "r");
1134 #endif
1135 if (fp == NULL) {
1136 Message (MSG_POSTERR, "FileOpen failed for input file '%s'", filename);
1137 return;
1138 }
1139
1140 aip = AsnIoNew (vfp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
1141 if (aip == NULL) {
1142 Message (MSG_POSTERR, "AsnIoNew failed for input file '%s'", filename);
1143 return;
1144 }
1145
1146 if (vfp->type == 4) {
1147 atp = atp_bss;
1148 } else if (vfp->type == 5) {
1149 atp = atp_ssp;
1150 } else {
1151 Message (MSG_POSTERR, "Batch processing type not set properly");
1152 return;
1153 }
1154
1155 longest [0] = '\0';
1156 worsttime = 0;
1157
1158 StringNCpy_0 (path, filename, sizeof (path));
1159 ptr = StringRChr (path, '.');
1160 if (ptr != NULL) {
1161 *ptr = '\0';
1162 }
1163 StringCat (path, ".val");
1164
1165 if (vfp->outpath != NULL) {
1166 ErrSetLogfile (vfp->outpath, ELOG_APPEND);
1167 } else if (vfp->verbosity == 0) {
1168 ErrSetLogfile (path, ELOG_APPEND);
1169 } else if (vfp->outfp == NULL) {
1170 ofp = FileOpen (path, "w");
1171 }
1172
1173 while ((! vfp->io_failure) && maxcount < vfp->maxcount &&
1174 (atp = AsnReadId (aip, amp, atp)) != NULL) {
1175 if (aip->io_failure) {
1176 vfp->io_failure = TRUE;
1177 aip->io_failure = FALSE;
1178 }
1179 if (atp == atp_se) {
1180
1181 SeqMgrHoldIndexing (TRUE);
1182 sep = SeqEntryAsnRead (aip, atp);
1183 SeqMgrHoldIndexing (FALSE);
1184
1185 /* propagate submission citation as descriptor onto each Seq-entry */
1186
1187 if (subcit != NULL && sep != NULL && sep->data.ptrvalue != NULL) {
1188 if (sep->choice == 1) {
1189 bsp = (BioseqPtr) sep->data.ptrvalue;
1190 ValNodeLink (&(bsp->descr),
1191 AsnIoMemCopy ((Pointer) subcit,
1192 (AsnReadFunc) SeqDescrAsnRead,
1193 (AsnWriteFunc) SeqDescrAsnWrite));
1194 } else if (sep->choice == 2) {
1195 bssp = (BioseqSetPtr) sep->data.ptrvalue;
1196 ValNodeLink (&(bssp->descr),
1197 AsnIoMemCopy ((Pointer) subcit,
1198 (AsnReadFunc) SeqDescrAsnRead,
1199 (AsnWriteFunc) SeqDescrAsnWrite));
1200 }
1201 }
1202
1203 if (sep != NULL) {
1204 if (skipcount < vfp->skipcount) {
1205 skipcount++;
1206 } else {
1207
1208 starttime = GetSecs ();
1209 buf [0] = '\0';
1210
1211 if (vfp->logfp != NULL) {
1212 fsep = FindNthBioseq (sep, 1);
1213 if (fsep != NULL && fsep->choice == 1) {
1214 bsp = (BioseqPtr) fsep->data.ptrvalue;
1215 if (bsp != NULL) {
1216 SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
1217 fprintf (vfp->logfp, "%s\n", buf);
1218 fflush (vfp->logfp);
1219 }
1220 }
1221 }
1222
1223 bsplist = NULL;
1224
1225 if (vfp->inferenceAccnCheck) {
1226 LookupFarSeqIDs (sep, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE);
1227 }
1228 if (vfp->lock) {
1229 bsplist = DoLockFarComponents (sep, vfp);
1230 }
1231
1232 DoValidation (sep, vfp, ofp);
1233
1234 bsplist = UnlockFarComponents (bsplist);
1235
1236 stoptime = GetSecs ();
1237 if (stoptime - starttime > worsttime && StringDoesHaveText (buf)) {
1238 worsttime = stoptime - starttime;
1239 StringCpy (longest, buf);
1240 }
1241 numrecords++;
1242 maxcount++;
1243 }
1244 }
1245
1246 SeqEntryFree (sep);
1247 omp = ObjMgrGet ();
1248 ObjMgrReapOne (omp);
1249 SeqMgrClearBioseqIndex ();
1250 ObjMgrFreeCache (0);
1251 FreeSeqIdGiCache ();
1252
1253 SeqEntrySetScope (NULL);
1254
1255 #ifdef OS_UNIX
1256 if (detailed_report && vfp->logfp != NULL) {
1257 ObjMgrReportProc (vfp->logfp);
1258 }
1259
1260 if (memory_usage && vfp->logfp != NULL) {
1261 Char mbuf [512];
1262 FILE *mufp;
1263 Char ch;
1264 Int4 len;
1265 CharPtr ptr1, ptr2;
1266 Int2 spaces;
1267 uid_t uid;
1268 uid = getpid ();
1269 sprintf (cmmd, "cat /proc/%d/stat", (int) uid);
1270 mufp = popen (cmmd, "r");
1271 if (mufp != NULL) {
1272 len = FileRead ((Pointer) mbuf, sizeof (Char), sizeof (mbuf), mufp);
1273 if (len > 0) {
1274 mbuf [(int) len] = '\0';
1275 ptr1 = mbuf;
1276 ch = *ptr1;
1277 spaces = 0;
1278 while (ch != '\0' && spaces < 22) {
1279 if (ch == ' ') {
1280 spaces++;
1281 }
1282 ptr1++;
1283 ch = *ptr1;
1284 }
1285 if (ch != '\0') {
1286 ptr2 = StringChr (ptr1, ' ');
1287 if (ptr2 != NULL) {
1288 *ptr2 = '\0';
1289 fprintf (vfp->logfp, "Memory usage %s\n", ptr1);
1290 }
1291 }
1292 }
1293 pclose (mufp);
1294 }
1295 }
1296 #endif
1297
1298 } else if (atp == atp_sbp) {
1299 sbp = SubmitBlockAsnRead (aip, atp);
1300 if (sbp != NULL) {
1301 csp = sbp->cit;
1302 if (csp != NULL) {
1303 MemSet ((Pointer) &ovn, 0, sizeof (ObjValNode));
1304 MemSet ((Pointer) &pd, 0, sizeof (Pubdesc));
1305 MemSet ((Pointer) &vn, 0, sizeof (ValNode));
1306 vn.choice = PUB_Sub;
1307 vn.data.ptrvalue = (Pointer) csp;
1308 vn.next = NULL;
1309 pd.pub = &vn;
1310 ovn.vn.choice = Seq_descr_pub;
1311 ovn.vn.data.ptrvalue = (Pointer) &pd;
1312 ovn.vn.next = NULL;
1313 ovn.vn.extended = 1;
1314 subcit = (SeqDescrPtr) &ovn;
1315 }
1316 }
1317 } else {
1318 AsnReadVal (aip, atp, NULL);
1319 }
1320
1321 if (aip->io_failure) {
1322 vfp->io_failure = TRUE;
1323 aip->io_failure = FALSE;
1324 }
1325 }
1326
1327 if (aip->io_failure) {
1328 vfp->io_failure = TRUE;
1329 }
1330
1331 if (vfp->io_failure) {
1332 Message (MSG_POSTERR, "Asn io_failure for input file '%s'", filename);
1333 }
1334
1335 if (ofp != NULL) {
1336 if (vfp->has_errors) {
1337 if (vfp->verbosity == 4) {
1338 fprintf (ofp, "</asnval>\n");
1339 }
1340 vfp->has_errors = FALSE;
1341 }
1342 FileClose (ofp);
1343 }
1344
1345 AsnIoFree (aip, FALSE);
1346
1347 #ifdef OS_UNIX
1348 if (usedPopen) {
1349 pclose (fp);
1350 } else {
1351 FileClose (fp);
1352 }
1353 #else
1354 FileClose (fp);
1355 #endif
1356
1357 if (vfp->logfp != NULL && (! StringHasNoText (longest))) {
1358 fprintf (vfp->logfp, "Longest processing time %ld seconds on %s\n",
1359 (long) worsttime, longest);
1360 fprintf (vfp->logfp, "Total number of records %ld\n", (long) numrecords);
1361 fflush (vfp->logfp);
1362 }
1363 }
1364
1365 static void ValidWrapper (
1366 SeqEntryPtr sep,
1367 Pointer userdata
1368 )
1369
1370 {
1371 BioseqPtr bsp;
1372 ValNodePtr bsplist;
1373 Char buf [64];
1374 SeqEntryPtr fsep;
1375 FILE *ofp = NULL;
1376 CharPtr ptr;
1377 ErrSev sev;
1378 time_t starttime, stoptime;
1379 ValFlagPtr vfp;
1380
1381 if (sep == NULL) return;
1382 vfp = (ValFlagPtr) userdata;
1383 if (vfp == NULL) return;
1384
1385 starttime = GetSecs ();
1386 buf [0] = '\0';
1387
1388 if (vfp->logfp != NULL) {
1389 fsep = FindNthBioseq (sep, 1);
1390 if (fsep != NULL && fsep->choice == 1) {
1391 bsp = (BioseqPtr) fsep->data.ptrvalue;
1392 if (bsp != NULL) {
1393 SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
1394 fprintf (vfp->logfp, "%s\n", buf);
1395 fflush (vfp->logfp);
1396 }
1397 }
1398 }
1399
1400 ptr = StringRChr (vfp->path, '.');
1401 if (ptr != NULL) {
1402 *ptr = '\0';
1403 }
1404 StringCat (vfp->path, ".val");
1405
1406 if (vfp->outpath != NULL) {
1407 ErrSetLogfile (vfp->outpath, ELOG_APPEND);
1408 } else if (vfp->verbosity == 0) {
1409 ErrSetLogfile (vfp->path, ELOG_APPEND);
1410 } else if (vfp->outfp == NULL) {
1411 ofp = FileOpen (vfp->path, "w");
1412 }
1413
1414 bsplist = NULL;
1415
1416 sev = ErrSetMessageLevel (SEV_WARNING);
1417 if (vfp->inferenceAccnCheck) {
1418 LookupFarSeqIDs (sep, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE);
1419 }
1420 if (vfp->lock) {
1421 bsplist = DoLockFarComponents (sep, vfp);
1422 }
1423 ErrSetMessageLevel (sev);
1424
1425 DoValidation (sep, vfp, ofp);
1426
1427 bsplist = UnlockFarComponents (bsplist);
1428
1429 if (ofp != NULL) {
1430 if (vfp->has_errors) {
1431 if (vfp->verbosity == 4) {
1432 fprintf (ofp, "</asnval>\n");
1433 }
1434 vfp->has_errors = FALSE;
1435 }
1436 FileClose (ofp);
1437 }
1438
1439 stoptime = GetSecs ();
1440 if (stoptime - starttime > vfp->worsttime && StringDoesHaveText (buf)) {
1441 vfp->worsttime = stoptime - starttime;
1442 StringCpy (vfp->longest, buf);
1443 }
1444 (vfp->numrecords)++;
1445 }
1446
1447 static void ProcessOneRecord (
1448 CharPtr filename,
1449 Pointer userdata
1450 )
1451
1452 {
1453 ValFlagPtr vfp;
1454
1455 vfp = (ValFlagPtr) userdata;
1456 if (vfp == NULL) return;
1457
1458 if (vfp->logfp != NULL) {
1459 fprintf (vfp->logfp, "%s\n", filename);
1460 fflush (vfp->logfp);
1461 }
1462
1463 if (vfp->automatic) {
1464 StringNCpy_0 (vfp->path, filename, sizeof (vfp->path));
1465 ReadSequenceAsnFile (filename, vfp->binary, vfp->compressed, (Pointer) vfp, ValidWrapper);
1466 } else if (vfp->batch) {
1467 ProcessMultipleRecord (filename, vfp);
1468 } else {
1469 ProcessSingleRecord (filename, vfp);
1470 }
1471 }
1472
/*
 * Indices into the myargs command-line table.  Each macro is named
 * <flag letter>_arg<Meaning>, and its value must equal the position of the
 * matching entry in myargs.
 */

#define p_argInputPath      0
#define i_argInputFile      1
#define o_argOutputFile     2
#define x_argSuffix         3
#define u_argRecurse        4
#define R_argSeverity       5
#define Q_argLowCutoff      6
#define P_argHighCutoff     7
#define E_argOnlyThisErr    8
#define A_argAlignments     9
#define J_argIsoJta        10
#define Z_argRemoteCDS     11
#define X_argExonSplice    12
#define G_argInfAccns      13
#define N_argLatLonStrict  14
#define M_argMatchTag      15
#define Y_argCheckOld      16
#define e_argIgnoreExcept  17
#define v_argVerbosity     18
#define a_argType          19
#define b_argBinary        20
#define c_argCompressed    21
#define r_argRemote        22
#define k_argLocalFetch    23
#define d_argAsnIdx        24
#define l_argLockFar       25
#define T_argThreads       26
#define L_argLogFile       27
#define K_argSummmary      28  /* spelling (sic) is what Main references */
#define S_argSkipCount     29
#define B_argBarcodeVal    30
#define C_argMaxCount      31
#ifdef INTERNAL_NCBI_ASN2VAL
#define w_argSeqSubParent  32
#define H_argAccessHUP     33
#define y_argAIndexer      34
#endif

/* bit flags tested against the integer value of the -N argument */

#define LAT_LON_STATE   1
#define LAT_LON_STRICT  2
1515
1516 Args myargs [] = {
1517 {"Path to ASN.1 Files", NULL, NULL, NULL,
1518 TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
1519 {"Single Input File", "stdin", NULL, NULL,
1520 TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
1521 {"Single Output File", NULL, NULL, NULL,
1522 TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
1523 {"File Selection Substring", ".ent", NULL, NULL,
1524 TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
1525 {"Recurse", "F", NULL, NULL,
1526 TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
1527 {"Severity for Error in Return Code", "4", "0", "6",
1528 FALSE, 'R', ARG_INT, 0.0, 0, NULL},
1529 {"Lowest Severity for Error to Show", "3", "0", "4",
1530 FALSE, 'Q', ARG_INT, 0.0, 0, NULL},
1531 {"Highest Severity for Error to Show", "4", "0", "4",
1532 FALSE, 'P', ARG_INT, 0.0, 0, NULL},
1533 {"Only Error Code to Show", NULL, NULL, NULL,
1534 TRUE, 'E', ARG_STRING, 0.0, 0, NULL},
1535 {"Validate Alignments", "F", NULL, NULL,
1536 TRUE, 'A', ARG_BOOLEAN, 0.0, 0, NULL},
1537 {"Require ISO-JTA?", "F", NULL, NULL,
1538 TRUE, 'J', ARG_BOOLEAN, 0.0, 0, NULL},
1539 {"Remote CDS Product Fetch", "F", NULL, NULL,
1540 TRUE, 'Z', ARG_BOOLEAN, 0.0, 0, NULL},
1541 {"Exon Splice Check", "F", NULL, NULL,
1542 TRUE, 'X', ARG_BOOLEAN, 0.0, 0, NULL},
1543 {"Verify Inference Accessions", "F", NULL, NULL,
1544 TRUE, 'G', ARG_BOOLEAN, 0.0, 0, NULL},
1545 {"LatLon/Country Flags (1 Test State/Province, 2 Ignore Water Exception)", "0", "0", "3",
1546 TRUE, 'N', ARG_INT, 0.0, 0, NULL},
1547 {"Match locus_tag against General ID", "F", NULL, NULL,
1548 TRUE, 'M', ARG_BOOLEAN, 0.0, 0, NULL},
1549 {"Check Against Old IDs", "F", NULL, NULL,
1550 TRUE, 'Y', ARG_BOOLEAN, 0.0, 0, NULL},
1551 {"Ignore Transcription/Translation Exceptions", "F", NULL, NULL,
1552 TRUE, 'e', ARG_BOOLEAN, 0.0, 0, NULL},
1553 {"Verbosity", "1", "0", "4",
1554 FALSE, 'v', ARG_INT, 0.0, 0, NULL},
1555 {"ASN.1 Type (a Automatic, z Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit)", "a", NULL, NULL,
1556 TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
1557 {"Batch File is Binary", "F", NULL, NULL,
1558 TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
1559 {"Batch File is Compressed", "F", NULL, NULL,
1560 TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
1561 {"Remote Fetching from ID", "F", NULL, NULL,
1562 TRUE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
1563 {"Local Fetching", "F", NULL, NULL,
1564 TRUE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
1565 {"Path to Indexed Binary ASN.1 Data", NULL, NULL, NULL,
1566 TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
1567 {"Lock Components in Advance", "F", NULL, NULL,
1568 TRUE, 'l', ARG_BOOLEAN, 0.0, 0, NULL},
1569 {"Use Threads", "F", NULL, NULL,
1570 TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
1571 {"Log File", NULL, NULL, NULL,
1572 TRUE, 'L', ARG_FILE_OUT, 0.0, 0, NULL},
1573 {"Summary to Error File", "F", NULL, NULL,
1574 TRUE, 'K', ARG_BOOLEAN, 0.0, 0, NULL},
1575 {"Skip Count", "0", NULL, NULL,
1576 TRUE, 'S', ARG_INT, 0.0, 0, NULL},
1577 {"Barcode Validate", "F", NULL, NULL,
1578 TRUE, 'B', ARG_BOOLEAN, 0.0, 0, NULL},
1579 {"Max Count", "0", NULL, NULL,
1580 TRUE, 'C', ARG_INT, 0.0, 0, NULL},
1581 #ifdef INTERNAL_NCBI_ASN2VAL
1582 {"SeqSubmitParent Flag", "F", NULL, NULL,
1583 TRUE, 'w', ARG_BOOLEAN, 0.0, 0, NULL},
1584 {"Internal Access to HUP", "F", NULL, NULL,
1585 TRUE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
1586 {"Special Indexer Tests", "F", NULL, NULL,
1587 TRUE, 'y', ARG_BOOLEAN, 0.0, 0, NULL},
1588 #endif
1589 };
1590
1591 Int2 Main (void)
1592
1593 {
1594 Char app [64];
1595 CharPtr asnidx, directory, infile, logfile, outfile, str, suffix;
1596 Boolean automatic, batch, binary, compressed, dorecurse,
1597 indexed, local, lock, remote, summary, usethreads;
1598 #ifdef INTERNAL_NCBI_ASN2VAL
1599 Boolean hup = FALSE;
1600 #endif
1601 time_t run_time, start_time, stop_time;
1602 Int2 type = 0, val;
1603 ValFlagData vfd;
1604
1605 /* standard setup */
1606
1607 ErrSetFatalLevel (SEV_MAX);
1608 ErrSetMessageLevel (SEV_MAX);
1609 ErrClearOptFlags (EO_SHOW_USERSTR);
1610 ErrSetLogfile ("stderr", ELOG_APPEND);
1611 ErrSetOpts (ERR_IGNORE, ERR_LOG_ON);
1612
1613 UseLocalAsnloadDataAndErrMsg ();
1614 ErrPathReset ();
1615
1616 if (! AllObjLoad ()) {
1617 Message (MSG_FATAL, "AllObjLoad failed");
1618 return 1;
1619 }
1620 if (! SubmitAsnLoad ()) {
1621 Message (MSG_FATAL, "SubmitAsnLoad failed");
1622 return 1;
1623 }
1624 if (! FeatDefSetLoad ()) {
1625 Message (MSG_FATAL, "FeatDefSetLoad failed");
1626 return 1;
1627 }
1628 if (! SeqCodeSetLoad ()) {
1629 Message (MSG_FATAL, "SeqCodeSetLoad failed");
1630 return 1;
1631 }
1632 if (! GeneticCodeTableLoad ()) {
1633 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1634 return 1;
1635 }
1636
1637 /* process command line arguments */
1638
1639 sprintf (app, "asnval %s", ASNVAL_APPLICATION);
1640 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1641 return 0;
1642 }
1643
1644 /* additional setup modifications */
1645
1646 MemSet ((Pointer) &vfd, 0, sizeof (ValFlagData));
1647
1648 directory = (CharPtr) myargs [p_argInputPath].strvalue;
1649 suffix = (CharPtr) myargs [x_argSuffix].strvalue;
1650 infile = (CharPtr) myargs [i_argInputFile].strvalue;
1651 outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
1652 dorecurse = (Boolean) myargs [u_argRecurse].intvalue;
1653 remote = (Boolean ) myargs [r_argRemote].intvalue;
1654 local = (Boolean) myargs [k_argLocalFetch].intvalue;
1655 #ifdef INTERNAL_NCBI_ASN2VAL
1656 hup = (Boolean) myargs [H_argAccessHUP].intvalue;
1657 #endif
1658 asnidx = (CharPtr) myargs [d_argAsnIdx].strvalue;
1659 indexed = (Boolean) StringDoesHaveText (asnidx);
1660 lock = (Boolean) myargs [l_argLockFar].intvalue;
1661 usethreads = (Boolean) myargs [T_argThreads].intvalue;
1662
1663 vfd.severity = (Int2) myargs [R_argSeverity].intvalue;
1664 vfd.lowCutoff = (Int2) myargs [Q_argLowCutoff].intvalue;
1665 vfd.highCutoff = (Int2) myargs [P_argHighCutoff].intvalue;
1666 vfd.errcode = (CharPtr) myargs [E_argOnlyThisErr].strvalue;
1667 vfd.validateAlignments = (Boolean) myargs [A_argAlignments].intvalue;
1668 vfd.alignFindRemoteBsp = (Boolean) (vfd.validateAlignments && remote);
1669 vfd.doSeqHistAssembly = (Boolean) myargs [A_argAlignments].intvalue;
1670 vfd.farIDsInAlignments = (Boolean) myargs [A_argAlignments].intvalue;
1671 vfd.alwaysRequireIsoJTA = (Boolean) myargs [J_argIsoJta].intvalue;
1672 vfd.farFetchCDSproducts = (Boolean) myargs [Z_argRemoteCDS].intvalue;
1673 vfd.farFetchMRNAproducts = (Boolean) myargs [Z_argRemoteCDS].intvalue;
1674 vfd.locusTagGeneralMatch = (Boolean) myargs [M_argMatchTag].intvalue;
1675 vfd.validateIDSet = (Boolean) myargs [Y_argCheckOld].intvalue;
1676 vfd.ignoreExceptions = (Boolean) myargs [e_argIgnoreExcept].intvalue;
1677 vfd.validateExons = (Boolean) myargs [X_argExonSplice].intvalue;
1678 vfd.inferenceAccnCheck = (Boolean) myargs [G_argInfAccns].intvalue;
1679 vfd.validateBarcode = (Boolean) myargs[B_argBarcodeVal].intvalue;
1680
1681
1682 val = (Int2) myargs [N_argLatLonStrict].intvalue;
1683 vfd.testLatLonSubregion = (Boolean) ((val & LAT_LON_STATE) != 0);
1684 vfd.strictLatLonCountry = (Boolean) ((val & LAT_LON_STRICT) != 0);
1685
1686 vfd.verbosity = (Int2) myargs [v_argVerbosity].intvalue;
1687
1688 vfd.skipcount = (Int4) myargs [S_argSkipCount].intvalue;
1689 vfd.maxcount = (Int4) myargs [C_argMaxCount].intvalue;
1690 if (vfd.maxcount < 1) {
1691 vfd.maxcount = INT4_MAX;
1692 }
1693
1694 #ifdef INTERNAL_NCBI_ASN2VAL
1695 vfd.seqSubmitParent = (Boolean) myargs [w_argSeqSubParent].intvalue;
1696 vfd.indexerVersion = (Boolean) myargs [y_argAIndexer].intvalue;
1697 #endif
1698
1699 #ifdef INTERNAL_NCBI_ASN2VAL
1700 SetAppProperty ("InternalNcbiSequin", (void *) 1024);
1701 #endif
1702
1703 automatic = FALSE;
1704 batch = FALSE;
1705 binary = (Boolean) myargs [b_argBinary].intvalue;
1706 compressed = (Boolean) myargs [c_argCompressed].intvalue;
1707
1708 str = myargs [a_argType].strvalue;
1709 if (StringICmp (str, "a") == 0) {
1710 type = 1;
1711 automatic = TRUE;
1712 } else if (StringICmp (str, "z") == 0) {
1713 type = 1;
1714 } else if (StringICmp (str, "e") == 0) {
1715 type = 2;
1716 } else if (StringICmp (str, "b") == 0) {
1717 type = 3;
1718 } else if (StringICmp (str, "s") == 0) {
1719 type = 4;
1720 } else if (StringICmp (str, "m") == 0) {
1721 type = 5;
1722 } else if (StringICmp (str, "t") == 0) {
1723 type = 4;
1724 batch = TRUE;
1725 } else if (StringICmp (str, "u") == 0) {
1726 type = 5;
1727 batch = TRUE;
1728 } else {
1729 type = 1;
1730 }
1731
1732 if ((binary || compressed) && (! batch)) {
1733 if (type == 1) {
1734 Message (MSG_FATAL, "-b or -c cannot be used without -t or -a");
1735 return 1;
1736 }
1737 }
1738
1739 if (StringHasNoText (directory) && StringHasNoText (infile)) {
1740 Message (MSG_FATAL, "Input path or input file must be specified");
1741 return 1;
1742 }
1743
1744 logfile = (CharPtr) myargs [L_argLogFile].strvalue;
1745 summary = (Boolean) myargs [K_argSummmary].intvalue;
1746
1747 start_time = GetSecs ();
1748
1749 /* populate parameter structure */
1750
1751 vfd.automatic = automatic;
1752 vfd.batch = batch;
1753 vfd.binary = binary;
1754 vfd.compressed = compressed;
1755 vfd.lock = lock;
1756 vfd.useThreads = usethreads;
1757 vfd.type = type;
1758 vfd.logfp = NULL;
1759 vfd.num_errors = 0;
1760 vfd.fatal_errors = 0;
1761 vfd.has_errors = FALSE;
1762 vfd.io_failure = FALSE;
1763 vfd.longest [0] = '\0';
1764 vfd.worsttime = 0;
1765 vfd.numrecords = 0;
1766
1767 if (! StringHasNoText (outfile)) {
1768 if (vfd.verbosity == 0) {
1769 vfd.outpath = outfile;
1770 } else {
1771 vfd.outfp = FileOpen (outfile, "w");
1772 if (vfd.outfp == NULL) {
1773 Message (MSG_FATAL, "Unable to open single output file");
1774 return 1;
1775 }
1776 }
1777 }
1778
1779 if (! StringHasNoText (logfile)) {
1780 vfd.logfp = FileOpen (logfile, "w");
1781 if (vfd.logfp == NULL) {
1782 Message (MSG_FATAL, "Unable to open log file");
1783 return 1;
1784 }
1785 }
1786
1787 /* register fetch functions */
1788
1789 if (remote) {
1790 #ifdef INTERNAL_NCBI_ASN2VAL
1791 if (hup) {
1792 DirSubFetchEnable ();
1793 SmartFetchEnable ();
1794 TPASmartFetchEnable ();
1795 }
1796
1797 if (! PUBSEQBioseqFetchEnable ("asnval", FALSE)) {
1798 Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
1799 return 1;
1800 }
1801 vfd.usePUBSEQ = TRUE;
1802 vfd.useThreads = FALSE;
1803 #else
1804 PubSeqFetchEnable ();
1805 #endif
1806 if (vfd.inferenceAccnCheck) {
1807 SeqMgrSetPreCache (GiRevHistLookupFarSeqIDs);
1808 }
1809 if (vfd.validateIDSet) {
1810 SeqMgrSetSeqIdSetFunc (GiRevHistLookupSeqIdSet);
1811 }
1812 }
1813
1814 if (local) {
1815 LocalSeqFetchInit (FALSE);
1816 }
1817
1818 if (indexed) {
1819 AsnIndexedLibFetchEnable (asnidx, TRUE);
1820 }
1821
1822 /* recurse through all files within source directory or subdirectories */
1823
1824 if (StringDoesHaveText (directory)) {
1825
1826 DirExplore (directory, NULL, suffix, dorecurse, ProcessOneRecord, (Pointer) &vfd);
1827
1828 } else if (StringDoesHaveText (infile)) {
1829
1830 ProcessOneRecord (infile, (Pointer) &vfd);
1831 }
1832
1833 stop_time = GetSecs ();
1834 run_time = stop_time - start_time;
1835
1836 if (vfd.outfp != NULL) {
1837 if (vfd.has_errors) {
1838 if (vfd.verbosity == 4) {
1839 fprintf (vfd.outfp, "</asnval>\n");
1840 }
1841 vfd.has_errors = FALSE;
1842 }
1843 if (summary) {
1844 fprintf (vfd.outfp, "Finished in %ld seconds\n", (long) run_time);
1845 if (StringDoesHaveText (vfd.longest)) {
1846 fprintf (vfd.outfp, "Longest processing time %ld seconds on %s\n",
1847 (long) vfd.worsttime, vfd.longest);
1848 fprintf (vfd.outfp, "Total number of records %ld\n", (long) vfd.numrecords);
1849 }
1850 }
1851 FileClose (vfd.outfp);
1852 }
1853
1854 if (vfd.logfp != NULL) {
1855 fprintf (vfd.logfp, "Finished in %ld seconds\n", (long) run_time);
1856 if (StringDoesHaveText (vfd.longest)) {
1857 fprintf (vfd.logfp, "Longest processing time %ld seconds on %s\n",
1858 (long) vfd.worsttime, vfd.longest);
1859 fprintf (vfd.logfp, "Total number of records %ld\n", (long) vfd.numrecords);
1860 }
1861 FileClose (vfd.logfp);
1862 }
1863
1864 /* close fetch functions */
1865
1866 if (indexed) {
1867 AsnIndexedLibFetchDisable ();
1868 }
1869
1870 if (local) {
1871 LocalSeqFetchDisable ();
1872 }
1873
1874 if (remote) {
1875 #ifdef INTERNAL_NCBI_ASN2VAL
1876 PUBSEQBioseqFetchDisable ();
1877 #else
1878 PubSeqFetchDisable ();
1879 #endif
1880 SeqMgrSetPreCache (NULL);
1881 SeqMgrSetSeqIdSetFunc (NULL);
1882 }
1883
1884 TransTableFreeAll ();
1885
1886 ECNumberFSAFreeAll ();
1887
1888 if (vfd.fatal_errors > 0) return 1;
1889 if (vfd.io_failure) return 1;
1890
1891 return 0;
1892 }
1893
1894 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |