|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/demo/aceread_tst.c |
source navigation diff markup identifier search freetext search file search |
1 /* aceread_tst.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: aceread_tst.c
27 *
28 * Author: Colleen Bollin
29 *
30 * Version Creation Date: 7/22/08
31 *
32 * $Revision: 1.27 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <objfdef.h>
50 #include <seqport.h>
51 #include <sequtil.h>
52 #include <sqnutils.h>
53 #include <subutil.h>
54 #include <gather.h>
55 #include <explore.h>
56 #include <lsqfetch.h>
57 #include <valid.h>
58 #include <pmfapi.h>
59 #ifdef INTERNAL_NCBI_ASNDISC
60 #include <accpubseq.h>
61 #include <tax3api.h>
62 #endif
63
64 #include "aceread.h"
65 #include "acerdapi.h"
66
/*
 * Symbolic indexes for the command-line options.  The enumerators are
 * listed in the same order as the entries of the myargs table, and the
 * leading letter of each name is the option's flag character.
 */
typedef enum {
  i_argInputFile = 0,             /* -i */
  o_argOutputFile,                /* -o */
  f_argFASTA,                     /* -f */
  S_argIDSubstitutionFile,        /* -S */
  R_argSRRids,                    /* -R */
  L_argSuppressIdLookup,          /* -L */
  Q_argMakeQualScores,            /* -Q */
  X_argXMLFile,                   /* -X */
  t_argTemplateFile,              /* -t */
  T_argTSAFields,                 /* -T */
  C_argCenter,                    /* -C */
  F_argFormat,                    /* -F */
  G_argGapString,                 /* -G */
  V_argValidateAgainstAsn1File,   /* -V */
  q_argReadQualScoresFile,        /* -q */
  r_argReadFASTAFile,             /* -r */
  N_argRecalculateConsensus,      /* -N */
  c_argChunkSize,                 /* -c */
  n_argReadNameType,              /* -n */
  z_argIncludeReads,              /* -z */
  l_argLimitNumContigs            /* -l */
} EArgNum;
90
91 Args myargs [] = {
92 {"Single Input File", "stdin", NULL, NULL,
93 TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
94 {"Single Output File", NULL, NULL, NULL,
95 TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
96 {"FASTA Output", "F", NULL, NULL,
97 TRUE, 'f', ARG_BOOLEAN, 0.0, 0, NULL},
98 {"ID Substitution File", "", NULL, NULL,
99 TRUE, 'S', ARG_FILE_IN, 0.0, 0, NULL},
100 {"Replacement IDs are SRR", "F", NULL, NULL,
101 TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
102 {"Suppress ID Lookup", "F", NULL, NULL,
103 TRUE, 'L', ARG_BOOLEAN, 0.0, 0, NULL},
104 {"Make Qual Scores", "T", NULL, NULL,
105 TRUE, 'Q', ARG_BOOLEAN, 0.0, 0, NULL},
106 {"XML Output File", "", NULL, NULL,
107 TRUE, 'X', ARG_FILE_OUT, 0.0, 0, NULL },
108 {"Template File", "", NULL, NULL,
109 TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL },
110 {"TSA fields", NULL, NULL, NULL,
111 TRUE, 'T', ARG_STRING, 0.0, 0, NULL },
112 {"Genome Center Tag", NULL, NULL, NULL,
113 TRUE, 'C', ARG_STRING, 0.0, 0, NULL},
114 {"Assembly Format\n\tM MAQ\n\tE Standalone Eland\n\tA ACE", "A", NULL, NULL,
115 TRUE, 'F', ARG_STRING, 0.0, 0, NULL},
116 {"Gap String", NULL, NULL, NULL,
117 TRUE, 'G', ARG_STRING, 0.0, 0, NULL},
118 {"ASN.1 File to validate against", NULL, NULL, NULL,
119 TRUE, 'V', ARG_FILE_IN, 0.0, 0, NULL},
120 {"Quality score file for read sequences", NULL, NULL, NULL,
121 TRUE, 'q', ARG_FILE_IN, 0.0, 0, NULL},
122 {"FASTA file for read sequences (to use when trimming read quality scores)", NULL, NULL, NULL,
123 TRUE, 'r', ARG_FILE_IN, 0.0, 0, NULL},
124 {"Recalculate consensus sequence using read data\n\tW Whole Consensus\n\tN Ns Only", "", NULL, NULL,
125 TRUE, 'N', ARG_STRING, 0.0, 0, NULL},
126 {"Number of contig bases per file", "50000", NULL, NULL,
127 TRUE, 'c', ARG_INT, 0.0, 0, NULL},
128 {"Read name type in ACE file\n\tL local trace name\n\tT TI number\n\tS SRR ID\n", "L", NULL, NULL,
129 TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
130 {"Include read sequences in ASN.1 output", "F", NULL, NULL,
131 TRUE, 'z', ARG_BOOLEAN, 0.0, 0, NULL},
132 {"Limit number of contigs to read", NULL, NULL, NULL,
133 TRUE, 'l', ARG_INT, 0.0, 0, NULL},
134 };
135
136
137 static FILE *OpenAceFile (CharPtr infile)
138 {
139 FILE *f;
140 Int4 len;
141 #ifdef OS_UNIX
142 Char cmmd [256];
143 CharPtr gzcatprog;
144 int ret;
145 Boolean usedPopen = FALSE;
146 #endif
147
148 len = StringLen (infile);
149 if (StringCmp (infile + len - 3, ".gz") == 0) {
150 #ifdef OS_UNIX
151 gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
152 if (gzcatprog != NULL) {
153 sprintf (cmmd, "%s %s", gzcatprog, infile);
154 } else {
155 ret = system ("gzcat -h >/dev/null 2>&1");
156 if (ret == 0) {
157 sprintf (cmmd, "gzcat %s", infile);
158 } else if (ret == -1) {
159 Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
160 return NULL;
161 } else {
162 ret = system ("zcat -h >/dev/null 2>&1");
163 if (ret == 0) {
164 sprintf (cmmd, "zcat %s", infile);
165 } else if (ret == -1) {
166 Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
167 return NULL;
168 } else {
169 Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
170 return NULL;
171 }
172 }
173 }
174 f = popen (cmmd, "r");
175 usedPopen = TRUE;
176 #else
177 Message (MSG_POSTERR, "Unable to read gzipped files when not running in UNIX");
178 return NULL;
179 #endif
180 } else {
181 f = FileOpen (infile, "r");
182 }
183 return f;
184 }
185
186
187 static Boolean ValidateAgainstASNFile (TACEFilePtr ace_file, CharPtr filename, char *has_errors)
188 {
189 Pointer dataptr;
190 Uint2 datatype;
191 SeqEntryPtr sep = NULL;
192 SeqSubmitPtr ssp = NULL;
193 Boolean chars_stripped = FALSE;
194 FILE *fp;
195 Boolean rval = FALSE;
196
197
198 fp = FileOpen (filename, "r");
199 if (fp == NULL) {
200 printf ("Unable to open %s\n", filename);
201 return FALSE;
202 }
203
204 /* Read in one sequence from the file */
205 dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, FALSE,
206 TRUE, FALSE, &chars_stripped);
207 FileClose (fp);
208 if (NULL == dataptr)
209 {
210 printf ("Unable to read SeqEntry from %s\n", filename);
211 return FALSE;
212 }
213
214 /* Convert the file data to a SeqEntry */
215
216 if (datatype == OBJ_SEQENTRY)
217 sep = (SeqEntryPtr) dataptr;
218 else if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET)
219 sep = SeqMgrGetSeqEntryForData (dataptr);
220 else if (datatype == OBJ_SEQSUB)
221 {
222 ssp = (SeqSubmitPtr) dataptr;
223 if (ssp != NULL && ssp->datatype == 1)
224 {
225 sep = (SeqEntryPtr) ssp->data;
226 }
227 }
228
229 rval = ValidateACEFileAgainstSeqEntry (ace_file, sep, has_errors);
230
231 if (ssp != NULL) {
232 ssp = SeqSubmitFree (ssp);
233 } else {
234 sep = SeqEntryFree (sep);
235 }
236 return rval;
237
238 }
239
240
241 static Boolean StringNHasNoText (CharPtr str, Int4 n)
242 {
243 CharPtr cp;
244 Int4 i;
245 if (str == NULL) return TRUE;
246 cp = str;
247 i = 0;
248 while (i < n) {
249 if (*cp == 0) return TRUE;
250 if (!isspace (*cp)) return FALSE;
251 cp++;
252 i++;
253 }
254 return TRUE;
255 }
256
257
258 static Boolean BracketMatchesLabel (CharPtr cp, CharPtr cp_equal, CharPtr label)
259 {
260 Int4 len;
261
262 if (cp == NULL || cp_equal == NULL || label == NULL) return FALSE;
263
264 len = StringLen (label);
265 if (StringNCmp (cp, label, len) == 0
266 && StringNHasNoText (cp + len, cp_equal - cp - len)) {
267 return TRUE;
268 } else {
269 return FALSE;
270 }
271 }
272
273
274 static CharPtr GetBracketValue (CharPtr cp, CharPtr cp_end)
275 {
276 Int4 len;
277 CharPtr val = NULL;
278
279 if (cp == NULL || cp_end == NULL || cp_end <= cp) return NULL;
280
281 cp += StringSpn (cp, " \t");
282 len = (cp_end - cp) + 1;
283 val = (CharPtr) MemNew (sizeof (Char) * len);
284 StringNCpy (val, cp, len - 1);
285 val [len] = 0;
286 while (len > 1 && isspace (val [len-1])) {
287 len--;
288 val[len] = 0;
289 }
290 return val;
291 }
292
293
294 static Boolean
295 GetTSAFieldsFromString
296 (CharPtr str,
297 CharPtr PNTR p_submitter_reference,
298 CharPtr PNTR p_archive_id,
299 CharPtr PNTR p_description,
300 CharPtr PNTR p_assembly,
301 Int4Ptr p_taxon_id)
302 {
303 CharPtr cp, cp_next, cp_equal, cp_end;
304 CharPtr subref = NULL, arch_id = NULL, desc = NULL, assembly = NULL, tmp;
305 Boolean is_bad = FALSE;
306
307 if (p_submitter_reference != NULL) {
308 *p_submitter_reference = NULL;
309 }
310 if (p_archive_id != NULL) {
311 *p_archive_id = NULL;
312 }
313 if (p_submitter_reference != NULL) {
314 *p_description = NULL;
315 }
316 if (StringHasNoText (str)) {
317 return TRUE;
318 }
319
320 cp = StringChr (str, '[');
321 while (cp != NULL && !is_bad) {
322 cp++;
323 cp_next = StringChr (cp + 1, '[');
324 cp_equal = StringChr (cp, '=');
325 cp_end = StringChr (cp, ']');
326 if (cp_equal == NULL || cp_end == NULL) {
327 is_bad = TRUE;
328 } else if (cp_equal > cp_end) {
329 is_bad = TRUE;
330 } else if (cp_next != NULL && (cp_equal > cp_next || cp_end > cp_next)) {
331 is_bad = TRUE;
332 } else {
333 cp += StringSpn (cp, " \t");
334 if (BracketMatchesLabel (cp, cp_equal, "subref")) {
335 if (subref == NULL) {
336 subref = GetBracketValue (cp_equal + 1, cp_end);
337 } else {
338 is_bad = TRUE;
339 }
340 } else if (BracketMatchesLabel (cp, cp_equal, "archive_id")) {
341 if (arch_id == NULL) {
342 arch_id = GetBracketValue (cp_equal + 1, cp_end);
343 } else {
344 is_bad = TRUE;
345 }
346 } else if (BracketMatchesLabel (cp, cp_equal, "desc")) {
347 if (desc == NULL) {
348 desc = GetBracketValue (cp_equal + 1, cp_end);
349 } else {
350 is_bad = TRUE;
351 }
352 } else if (BracketMatchesLabel (cp, cp_equal, "assembly")) {
353 if (assembly == NULL) {
354 assembly = GetBracketValue (cp_equal + 1, cp_end);
355 } else {
356 is_bad = TRUE;
357 }
358 } else if (BracketMatchesLabel (cp, cp_equal, "taxon_id")) {
359 tmp = GetBracketValue (cp_equal + 1, cp_end);
360 if (p_taxon_id != NULL) {
361 *p_taxon_id = atoi (tmp);
362 }
363 } else {
364 is_bad = TRUE;
365 }
366 }
367 cp = cp_next;
368 }
369 if (p_submitter_reference == NULL) {
370 subref = MemFree (subref);
371 } else {
372 *p_submitter_reference = subref;
373 }
374 if (p_archive_id == NULL) {
375 arch_id = MemFree (arch_id);
376 } else {
377 *p_archive_id = arch_id;
378 }
379 if (p_description == NULL) {
380 desc = MemFree (desc);
381 } else {
382 *p_description = desc;
383 }
384 if (p_assembly == NULL) {
385 assembly = MemFree (assembly);
386 } else {
387 *p_assembly = assembly;
388 }
389 return TRUE;
390 }
391
392
393 static void PrintTraceGapsXML (TGapInfoPtr gap_info)
394 {
395 Int4 i;
396
397 if (gap_info != NULL) {
398 printf (" <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps);
399 if (gap_info->num_gaps > 0) {
400 printf (" <tracegaps source=\"INLINE\">");
401 for (i = 0; i < gap_info->num_gaps - 1; i++) {
402 printf ("%d,", gap_info->gap_offsets[i]);
403 }
404 printf ("%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]);
405 }
406 }
407 }
408
409
410 static void TestPosConversions (TGapInfoPtr gap_info)
411 {
412 Int4 i, t_pos, s_pos = 0, r_pos;
413 Int4 test_len = 0;
414
415 if (gap_info != NULL && gap_info->num_gaps > 0) {
416 for (i = 0; i < gap_info->num_gaps; i++) {
417 test_len += gap_info->gap_offsets[i] + 1;
418 }
419 for (i = 0; i < test_len; i++) {
420 s_pos = SeqPosFromTilingPos (i, gap_info);
421 t_pos = TilingPosFromSeqPos (s_pos, gap_info);
422 if (t_pos != i) {
423 printf ("Failed! %d -> SeqPosFromTilingPos -> %d -> TilingPosFromSeqPos -> %d\n",
424 i, s_pos, t_pos);
425 }
426 r_pos = SeqPosFromTilingPos (t_pos, gap_info);
427 if (r_pos != s_pos) {
428 printf ("Failed! %d -> TilingPosFromSeqPos -> %d -> SeqPosFromTilingPos -> %d\n",
429 s_pos, t_pos, r_pos);
430 }
431 /* printf ("%d:%d:%d:%d\n", i, s_pos, t_pos, r_pos); */
432 }
433 }
434 }
435
436
437 static void PrintTraceReadXML (TContigReadPtr read)
438 {
439 if (read == NULL) {
440 printf ("Bad read\n");
441 } else {
442 printf ("<trace>\n");
443 printf (" <trace_name>%s</trace_name>\n", read->read_id == NULL ? "" : read->read_id);
444 PrintTraceGapsXML (read->gaps);
445 printf (" <nbasecalls>%d</nbasecalls>\n", StringLen (read->read_seq));
446 printf (" <valid>\n");
447 printf (" <start>%d</start>\n", read->read_assem_start + 1);
448 printf (" <stop>%d</stop>\n", read->read_assem_stop + 1);
449 printf (" </valid>\n");
450 printf (" <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD");
451 printf (" <start>%d</start>\n", read->cons_start + 1);
452 printf (" <start>%d</start>\n", read->cons_start + StringLen (read->read_seq) + 1);
453 printf (" </tiling>\n");
454 printf (" <consensus>\n");
455 printf (" <start>%d</start>\n", read->cons_start + 1);
456 printf (" <start>%d</start>\n", read->cons_start + StringLen (read->read_seq) + 1);
457 printf (" </consensus>\n");
458 printf ("<trace>\n");
459 }
460 }
461
462
463
464 static void TestGapInfoReading (CharPtr gap_string)
465 {
466 TGapInfoPtr gap_info;
467 ValNodePtr list, vnp;
468
469 if (!StringHasNoText (gap_string)) {
470 gap_info = GapInfoFromSequenceString(gap_string, "*");
471 if (gap_info == NULL) {
472 printf ("error reading");
473 } else {
474 PrintTraceGapsXML (gap_info);
475 TestPosConversions (gap_info);
476 list = GetTransitionsFromGapInfo (gap_info, 0, 0, 40);
477 for (vnp = list; vnp != NULL; vnp = vnp->next) {
478 printf ("%d\n", vnp->data.intvalue);
479 }
480 }
481 GapInfoFree (gap_info);
482 }
483 }
484
485
486 static void AddAlignmentToSeqEntry (DenseSegPtr dsp, SeqEntryPtr sep)
487 {
488 SeqAnnotPtr sap;
489 SeqAlignPtr salp;
490 BioseqPtr bsp;
491 BioseqSetPtr bssp;
492
493 if (dsp == NULL || sep == NULL) return;
494
495 sap = SeqAnnotNew ();
496 sap->type = 2;
497
498 salp = SeqAlignNew ();
499 salp->type = 3;
500 salp->segtype = 2;
501 salp->segs = (Pointer) dsp;
502 salp->dim = dsp->dim;
503 sap->data = (Pointer) salp;
504
505 if (IS_Bioseq (sep)) {
506 bsp = (BioseqPtr) sep->data.ptrvalue;
507 sap->next = bsp->annot;
508 bsp->annot = sap;
509 } else if (IS_Bioseq_set (sep)) {
510 bssp = (BioseqSetPtr) sep->data.ptrvalue;
511 sap->next = bssp->annot;
512 bssp->annot = sap;
513 }
514 }
515
516
517 static void AddDescrToNucBioseqCallback (BioseqPtr bsp, Pointer data)
518 {
519 SeqDescrPtr sdp, sdp_copy;
520
521 if (bsp == NULL || !ISA_na (bsp->mol) || data == NULL) {
522 return;
523 }
524 sdp = (SeqDescrPtr) data;
525 while (sdp != NULL) {
526 if (sdp->choice != Seq_descr_pub) {
527 sdp_copy = (SeqDescrPtr) AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescrAsnRead, (AsnWriteFunc) SeqDescrAsnWrite);
528 sdp_copy->next = bsp->descr;
529 bsp->descr = sdp_copy;
530 }
531 sdp = sdp->next;
532 }
533 }
534
535
536 static SeqSubmitPtr AddSeqSubmitFromTemplate (SeqEntryPtr sep, CharPtr filename)
537 {
538 SeqSubmitPtr ssp = NULL;
539 SubmitBlockPtr sbp;
540 CitSubPtr csp;
541 FILE *fp = NULL;
542 Pointer dataptr;
543 Uint2 datatype;
544
545 if (StringHasNoText (filename)) {
546 return NULL;
547 }
548
549 fp = FileOpen (filename, "r");
550 if (fp == NULL) {
551 printf ("Unable to read template file %s\n", filename);
552 return NULL;
553 }
554
555 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
556 if (datatype == OBJ_SEQSUB) {
557 ssp = (SeqSubmitPtr) dataptr;
558 ssp->datatype = 1;
559 ssp->data = sep;
560 } else if (datatype == OBJ_SUBMIT_BLOCK) {
561 sbp = (SubmitBlockPtr) dataptr;
562 ssp = SeqSubmitNew ();
563 ssp->datatype = 1;
564 ssp->data = sep;
565 ssp->sub = sbp;
566 } else if (datatype == OBJ_SEQDESC) {
567 VisitBioseqsInSep (sep, dataptr, AddDescrToNucBioseqCallback);
568 ObjMgrFree (datatype, dataptr);
569 } else {
570 ObjMgrFree (datatype, dataptr);
571 }
572 }
573 FileClose (fp);
574 if (ssp == NULL) {
575 ssp = SeqSubmitNew ();
576 ssp->datatype = 1;
577 ssp->data = sep;
578 }
579
580 if (ssp->sub == NULL) {
581 ssp->sub = SubmitBlockNew ();
582 }
583
584 ssp->sub->tool = MemFree (ssp->sub->tool);
585 ssp->sub->tool = StringSave ("aceread");
586 ssp->sub->hup = FALSE;
587 ssp->sub->reldate = DateFree (ssp->sub->reldate);
588 csp = ssp->sub->cit;
589 if (csp != NULL) {
590 csp->date = DateFree (csp->date);
591 csp->date = DateCurr ();
592 }
593 return ssp;
594 }
595
596
597 static Boolean AddReadQualityScores (TACEFilePtr afp, CharPtr qs_filename, CharPtr rd_filename)
598 {
599 ReadBufferData q, r;
600 Boolean use_fasta = FALSE;
601 Boolean rval = FALSE;
602
603 if (afp == NULL || StringHasNoText (qs_filename)) {
604 return TRUE;
605 }
606
607 q.current_data = NULL;
608 r.current_data = NULL;
609
610 q.fp = FileOpen (qs_filename, "r");
611 if (q.fp == NULL) {
612 printf ("Unable to read quality score file\n");
613 return FALSE;
614 }
615
616 if (!StringHasNoText (rd_filename)) {
617 r.fp = FileOpen (rd_filename, "r");
618 if (r.fp == NULL) {
619 printf ("Unable to open read FASTA file\n");
620 FileClose (q.fp);
621 return FALSE;
622 }
623 use_fasta = TRUE;
624 }
625
626 if (AddReadQualScores (afp, AbstractReadFunction, &q, use_fasta ? AbstractReadFunction : NULL, &r) > 0) {
627 rval = TRUE;
628 }
629
630 FileClose (q.fp);
631 if (use_fasta) {
632 FileClose (r.fp);
633 }
634 return rval;
635 }
636
637
638 static Boolean LIBCALL MyBioseqSetAsnWrite (BioseqSetPtr bsp, AsnIoPtr aip, AsnTypePtr orig)
639 {
640 DataVal av;
641 AsnTypePtr atp;
642 Boolean retval = FALSE;
643
644 if (aip == NULL)
645 return FALSE;
646
647 atp = AsnLinkType(orig, AsnFind ("Bioseq-set")); /* link local tree */
648 if (atp == NULL) return FALSE;
649
650 if (bsp == NULL) { AsnNullValueMsg(aip, atp); goto erret; }
651
652 if (! AsnOpenStruct(aip, atp, (Pointer)bsp)) goto erret;
653
654 if (bsp->id != NULL)
655 {
656 if (! ObjectIdAsnWrite(bsp->id, aip, AsnFind ("Bioseq-set.id"))) goto erret;
657 }
658 if (bsp->coll != NULL)
659 {
660 if (! DbtagAsnWrite(bsp->coll, aip, AsnFind ("Bioseq-set.coll"))) goto erret;
661 }
662 if (bsp->level != INT2_MIN)
663 {
664 av.intvalue = bsp->level;
665 if (! AsnWrite(aip, AsnFind ("Bioseq-set.level"), &av)) goto erret;
666 }
667 if (bsp->_class != 0)
668 {
669 av.intvalue = bsp->_class;
670 if (! AsnWrite(aip, AsnFind ("Bioseq-set.class"), &av)) goto erret;
671 }
672 if (bsp->release != NULL)
673 {
674 av.ptrvalue = bsp->release;
675 if (! AsnWrite(aip, AsnFind ("Bioseq-set.release"), &av)) goto erret;
676 }
677 if (bsp->date != NULL)
678 {
679 if (! DateAsnWrite(bsp->date, aip, AsnFind ("Bioseq-set.date"))) goto erret;
680 }
681 if (bsp->descr != NULL) /* Seq-descr optional */
682 {
683 if (! SeqDescrAsnWrite(bsp->descr, aip, AsnFind ("Bioseq-set.descr"))) goto erret;
684 }
685
686 if (! AsnOpenStruct(aip, AsnFind ("Bioseq-set.seq-set"), (Pointer)bsp->seq_set)) goto erret;
687 /* this is where we stop */
688 retval = TRUE;
689 erret:
690 AsnUnlinkType(orig); /* unlink local tree */
691 return retval;
692 }
693
694 static Boolean LIBCALL MySeqEntryAsnWrite (SeqEntryPtr sep, AsnIoPtr aip, AsnTypePtr orig)
695 {
696 AsnTypePtr atp;
697 DataVal av;
698 Boolean retval = FALSE;
699
700 if (aip == NULL)
701 return FALSE;
702
703 atp = AsnLinkType(orig, AsnFind ("Seq-entry")); /* link local tree */
704 if (atp == NULL) return FALSE;
705
706 if (sep == NULL) { AsnNullValueMsg(aip, atp); goto erret; }
707
708 av.ptrvalue = (Pointer)sep;
709 if (! AsnWriteChoice(aip, atp, (Int2)sep->choice, &av)) goto erret;
710 if (sep->choice == 1)
711 {
712 if (! BioseqAsnWrite((BioseqPtr)sep->data.ptrvalue, aip, AsnFind ("Seq-entry.seq")))
713 {
714 goto erret;
715 }
716 }
717 else if (sep->choice == 2)
718 {
719 if (! MyBioseqSetAsnWrite((BioseqSetPtr)sep->data.ptrvalue, aip, AsnFind ("Seq-entry.set")))
720 {
721 goto erret;
722 }
723 }
724 /* this is where we stop */
725 retval = TRUE;
726 erret:
727 AsnUnlinkType(orig);
728 return retval;
729 }
730
731
732 static Boolean MySeqSubmitAsnWrite (AsnIoPtr aip, SubmitBlockPtr sbp, SeqDescrPtr desc_list)
733 {
734 DataVal av;
735 AsnTypePtr atp;
736 Boolean retval = FALSE;
737 SeqEntryPtr sep;
738 SeqSubmitPtr ssp = NULL;
739 BioseqSetPtr bssp;
740 SeqDescrPtr sdp, sdp_copy;
741
742 if (aip == NULL)
743 return FALSE;
744
745 atp = AsnLinkType(NULL, AsnFind ("Seq-submit")); /* link local tree */
746 if (atp == NULL)
747 return FALSE;
748
749 ssp = SeqSubmitNew ();
750 ssp->sub = sbp;
751 ssp->datatype = 1;
752 sep = SeqEntryNew ();
753 sep->choice = 2;
754 bssp = BioseqSetNew ();
755 bssp->_class = BioseqseqSet_class_genbank;
756
757 if (desc_list != NULL) {
758 for (sdp = desc_list; sdp != NULL; sdp = sdp->next) {
759 if (sdp->choice == Seq_descr_pub) {
760 sdp_copy = (SeqDescrPtr) AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescrAsnRead, (AsnWriteFunc) SeqDescrAsnWrite);
761 sdp_copy->next = bssp->descr;
762 bssp->descr = sdp_copy;
763 }
764 }
765 }
766
767 sep->data.ptrvalue = bssp;
768 ssp->data = sep;
769
770 if (! AsnOpenStruct(aip, atp, (Pointer)ssp))
771 goto erret;
772
773 if (! SubmitBlockAsnWrite(ssp->sub, aip, AsnFind ("Seq-submit.sub"))) goto erret;
774
775 av.ptrvalue = ssp->data;
776 if (! AsnWriteChoice(aip, AsnFind ("Seq-submit.data"), (Int2)ssp->datatype, &av)) goto erret;
777
778 if (! AsnOpenStruct(aip, AsnFind ("Seq-submit.data.entrys"), ssp->data)) goto erret;
779 sep = (SeqEntryPtr) ssp->data;
780 if (! MySeqEntryAsnWrite(sep, aip, AsnFind ("Seq-submit.data.entrys.E"))) goto erret;
781 /* This is where we stop */
782 retval = TRUE;
783 erret:
784 ssp->sub = NULL;
785 ssp = SeqSubmitFree (ssp);
786 return retval;
787 }
788
789 static void StartSeqSubmit (AsnIoPtr aip, SubmitBlockPtr sbp, SeqDescrPtr desc_list)
790 {
791
792 if (aip == NULL || aip->fp == NULL) {
793 return;
794 }
795
796 if (sbp == NULL) {
797 fprintf (aip->fp, "Seq-entry ::= set {\n");
798 fprintf (aip->fp, "class genbank ,\n");
799 fprintf (aip->fp, "seq-set {\n");
800 } else {
801 MySeqSubmitAsnWrite (aip, sbp, desc_list);
802 AsnIoFlush (aip);
803 }
804 }
805
806
807 static DenseSegPtr DenseSegFromConsensusReadAln (TConsensusReadAlnPtr aln, CharPtr contig_id, CharPtr read_id)
808 {
809 DenseSegPtr dsp;
810 Int4 i;
811
812 if (aln == NULL) {
813 return NULL;
814 }
815
816 dsp = DenseSegNew ();
817 dsp->dim = 2;
818 dsp->numseg = aln->numseg;
819 dsp->ids = MakeSeqID (contig_id);
820 dsp->ids->next = MakeSeqID (read_id);
821 dsp->starts = (Int4Ptr) MemNew (sizeof (Int4) * dsp->dim * dsp->numseg);
822 dsp->lens = (Int4Ptr) MemNew (sizeof (Int4) * dsp->numseg);
823 if (aln->is_complement) {
824 dsp->strands = (Uint1Ptr) MemNew (sizeof (Uint1) * dsp->dim * dsp->numseg);
825 for (i = 0; i < dsp->numseg; i++) {
826 dsp->strands[i * 2] = Seq_strand_plus;
827 dsp->strands[(i * 2) + 1] = Seq_strand_minus;
828 }
829 }
830 for (i = 0; i < dsp->numseg; i++) {
831 dsp->starts[i * 2] = aln->cons_starts[i];
832 dsp->starts[(i * 2) + 1] = aln->read_starts[i];
833 dsp->lens [i] = aln->lens[i];
834 }
835 return dsp;
836 }
837
838
839 static SeqAlignPtr SeqAlignsForConsensusAndReads (TContigPtr contig)
840 {
841 SeqAlignPtr salp_list = NULL, salp_last = NULL, salp_tmp;
842 TConsensusReadAlnPtr aln;
843 DenseSegPtr dsp;
844 Int4 i;
845
846 if (contig == NULL) {
847 return NULL;
848 }
849
850 for (i = 0; i < contig->num_reads; i++) {
851 aln = GetConsensusReadAln (contig->consensus_seq, contig->reads[i]);
852 if (aln != NULL) {
853 dsp = DenseSegFromConsensusReadAln (aln, contig->consensus_id, contig->reads[i]->read_id);
854 if (dsp != NULL) {
855 salp_tmp = SeqAlignNew ();
856 salp_tmp->type = SAT_MASTERSLAVE;
857 salp_tmp->segtype = SAS_DENSEG;
858 salp_tmp->segs = dsp;
859 salp_tmp->dim = 2;
860 if (salp_list == NULL) {
861 salp_list = salp_tmp;
862 } else {
863 salp_last->next = salp_tmp;
864 }
865 salp_last = salp_tmp;
866 }
867 }
868 }
869 return salp_list;
870 }
871
872
873 static SeqEntryPtr MakeContigSeqEntryWithReads (TContigPtr contig)
874 {
875 BioseqSetPtr bssp;
876 SeqEntryPtr sep, sep_prev;
877 Int4 i;
878 SeqAlignPtr salp;
879
880 if (contig == NULL) {
881 return NULL;
882 }
883
884 bssp = BioseqSetNew ();
885 bssp->_class = BioseqseqSet_class_genbank;
886 bssp->seq_set = MakeSeqEntryFromContig (contig);
887 salp = SeqAlignsForConsensusAndReads (contig);
888 if (salp != NULL) {
889 bssp->annot = SeqAnnotNew ();
890 bssp->annot->type = 2;
891 bssp->annot->data = salp;
892 }
893 sep_prev = bssp->seq_set;
894 for (i = 0; i < contig->num_reads; i++) {
895 sep = MakeSeqEntryFromRead (contig->reads[i]);
896 sep_prev->next = sep;
897 sep_prev = sep;
898 }
899 sep = SeqEntryNew ();
900 sep->choice = 2;
901 sep->data.ptrvalue = bssp;
902 return sep;
903 }
904
905
906 static void WriteXMLMsgUnableToOpenFile (CharPtr has_errors, CharPtr filename)
907 {
908 if (has_errors == NULL || filename == NULL) {
909 return;
910 }
911 if (*has_errors == 0) {
912 printf ("<aceread>\n");
913 *has_errors = 1;
914 }
915 printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Unable to open %s</message>\n", filename);
916 }
917
918
919 typedef struct contigcountcallback {
920 Int4 num_contigs;
921 Uint4 num_conbases;
922 Int4 num_reads;
923 Uint4 num_readbases;
924 Int4 file_num;
925 } ContigCountCallbackData, PNTR ContigCountCallbackPtr;
926
927
928 static ContigCountCallbackPtr ContigCountCallbackNew ()
929 {
930 ContigCountCallbackPtr c;
931
932 c = (ContigCountCallbackPtr) MemNew (sizeof (ContigCountCallbackData));
933 MemSet (c, 0, sizeof (ContigCountCallbackData));
934 return c;
935 }
936
937
938 static ContigCountCallbackPtr SummarizeContigCountList (ValNodePtr list)
939 {
940 ContigCountCallbackPtr summ, c;
941
942 summ = ContigCountCallbackNew();
943 while (list != NULL) {
944 c = (ContigCountCallbackPtr) list->data.ptrvalue;
945 if (c != NULL) {
946 summ->num_contigs += c->num_contigs;
947 summ->num_conbases += c->num_conbases;
948 summ->num_reads += c->num_reads;
949 summ->num_readbases += c->num_readbases;
950 }
951 list = list->next;
952 }
953 return summ;
954 }
955
956
957 typedef struct contigfilelist {
958 ValNodePtr list;
959 Int4 max_bases;
960 ContigCountCallbackPtr current;
961 } ContigFileListData, PNTR ContigFileListPtr;
962
963
964 #define ONE_CONTIG_FOR_FIRST
965
966 static char ProcessContigCountCallback (TContigPtr contig, void *data)
967 {
968 ContigFileListPtr list;
969 Int4 i;
970
971 list = (ContigFileListPtr) data;
972 if (contig == NULL || list == NULL) {
973 return 0;
974 }
975
976 if (list->current == NULL || list->current->num_conbases > list->max_bases
977 #ifdef ONE_CONTIG_FOR_FIRST
978 || list->list->next == NULL
979 #endif
980 ) {
981 list->current = ContigCountCallbackNew();
982 list->current->file_num = ValNodeLen (list->list);
983 ValNodeAddPointer (&(list->list), 0, list->current);
984 }
985
986 list->current->num_contigs++;
987 list->current->num_conbases += contig->consensus_seq_len;
988 list->current->num_reads += contig->num_reads;
989
990 for (i = 0; i < contig->num_reads; i++) {
991 list->current->num_readbases += contig->reads[i]->read_len;
992 }
993 return 1;
994 }
995
996
/* How the read names in the ACE file are to be interpreted. */
typedef enum {
  eReadNameType_local = 0,   /* local trace name */
  eReadNameType_TI,          /* TI number */
  eReadNameType_SRR          /* SRR ID */
} EReadNameType;
1001
1002 static EReadNameType ReadNameTypeFromArg (CharPtr arg)
1003 {
1004 EReadNameType read_name_type = eReadNameType_local;
1005
1006 if (arg != NULL) {
1007 if (StringNICmp (arg, "T", 1) == 0) {
1008 read_name_type = eReadNameType_TI;
1009 } else if (StringNICmp (arg, "S", 1) == 0) {
1010 read_name_type = eReadNameType_SRR;
1011 }
1012 }
1013 return read_name_type;
1014 }
1015
1016
1017 typedef struct contigcallback {
1018 AsnIoPtr asn1_out;
1019 AsnTypePtr atp;
1020 FILE *fasta_out;
1021 FILE *qual_out;
1022 FILE *xml_out;
1023
1024 ValNodePtr file_counts_list;
1025 Int4 contig_count;
1026
1027 CharPtr fasta_base;
1028 CharPtr asn_base;
1029 CharPtr xml_base;
1030 CharPtr qual_base;
1031
1032 /* XML values */
1033 CharPtr subref;
1034 CharPtr center_name;
1035 Int4 taxid;
1036 CharPtr description;
1037 CharPtr assembly;
1038
1039 Boolean recalculate_consensus;
1040 Boolean recalculate_only_Ns;
1041
1042 Boolean no_lookup;
1043 Boolean is_srr;
1044 Boolean asn1_include_reads;
1045
1046 EReadNameType read_name_type;
1047
1048 SeqIdReplaceListPtr id_replacement_list;
1049
1050 SubmitBlockPtr sbp;
1051 SeqDescrPtr desc_list;
1052
1053 char *has_errors;
1054 } ContigCallbackData, PNTR ContigCallbackPtr;
1055
1056
1057 static AsnIoPtr StartAsnFile (CharPtr filename, SubmitBlockPtr sbp, SeqDescrPtr desc_list)
1058 {
1059 AsnIoPtr aip;
1060
1061 aip = AsnIoOpen (filename, "w");
1062 if (aip != NULL) {
1063 aip->indent_level = 1;
1064 aip->first[aip->indent_level] = FALSE;
1065 StartSeqSubmit (aip, sbp, desc_list);
1066 }
1067 return aip;
1068 }
1069
1070
1071 static AsnIoPtr EndAsnFile (AsnIoPtr aip, Boolean is_submitblock)
1072 {
1073 if (aip != NULL) {
1074 AsnIoFlush (aip);
1075 if (is_submitblock) {
1076 fprintf (aip->fp, " } } } }\n");
1077 } else {
1078 fprintf (aip->fp, " } }\n");
1079 }
1080 AsnIoClose (aip);
1081 aip = NULL;
1082 }
1083 return aip;
1084 }
1085
1086
1087 static char ProcessContigCallback (TContigPtr contig, void *data)
1088 {
1089 ContigCallbackPtr c;
1090 SeqEntryPtr sep;
1091 Char filename[300];
1092 ContigCountCallbackPtr count = NULL;
1093 ValNodePtr tmp;
1094 Boolean write_out = FALSE;
1095 Int4 i, ti;
1096 char rval = 0;
1097
1098 c = (ContigCallbackPtr) data;
1099 if (contig == NULL || c == NULL) {
1100 return 0;
1101 }
1102
1103 if (c->id_replacement_list != NULL) {
1104 UpdateContigIds (contig, c->id_replacement_list, c->no_lookup, c->is_srr, c->has_errors);
1105 }
1106
1107 if (c->read_name_type == eReadNameType_TI) {
1108 for (i = 0; i < contig->num_reads; i++) {
1109 if (contig->reads[i]->read_id != NULL) {
1110 ti = atoi (contig->reads[i]->read_id);
1111 if (ti < 1) {
1112 if (*(c->has_errors) == 0) {
1113 printf ("<aceread>\n");
1114 *(c->has_errors) = 1;
1115 }
1116 printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">Non-integer value for ti</message>\n", contig->reads[i]->read_id);
1117 } else if (contig->reads[i]->ti == 0) {
1118 contig->reads[i]->ti = ti;
1119 free (contig->reads[i]->read_id);
1120 contig->reads[i]->read_id = NULL;
1121 } else if (ti == contig->reads[i]->ti) {
1122 free (contig->reads[i]->read_id);
1123 contig->reads[i]->read_id = NULL;
1124 } else {
1125 if (*(c->has_errors) == 0) {
1126 printf ("<aceread>\n");
1127 *(c->has_errors) = 1;
1128 }
1129 printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">Conflicting values for ti</message>\n", contig->reads[i]->read_id);
1130 }
1131 }
1132 }
1133 } else if (c->read_name_type == eReadNameType_SRR) {
1134 for (i = 0; i < contig->num_reads; i++) {
1135 if (contig->reads[i]->read_id != NULL) {
1136 if (contig->reads[i]->srr == NULL) {
1137 contig->reads[i]->srr = contig->reads[i]->read_id;
1138 contig->reads[i]->read_id = NULL;
1139 } else if (StringCmp (contig->reads[i]->read_id, contig->reads[i]->srr) == 0) {
1140 free (contig->reads[i]->read_id);
1141 contig->reads[i]->read_id = NULL;
1142 } else {
1143 if (*(c->has_errors) == 0) {
1144 printf ("<aceread>\n");
1145 *(c->has_errors) = 1;
1146 }
1147 printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">Conflicting values for srr</message>\n", contig->reads[i]->read_id);
1148 }
1149 }
1150 }
1151 }
1152
1153 if (c->recalculate_consensus) {
1154 /* TODO - add read quality scores ? */
1155
1156 if (ReplaceConsensusSequenceFromTraces (contig, c->recalculate_only_Ns) > 0) {
1157 write_out = TRUE;
1158 }
1159 } else {
1160 write_out = TRUE;
1161 }
1162
1163 c->contig_count ++;
1164
1165 if (write_out) {
1166 rval = 1;
1167 if (c->file_counts_list != NULL) {
1168 count = c->file_counts_list->data.ptrvalue;
1169 }
1170
1171 /* write ASN.1 */
1172 if (c->asn1_out == NULL
1173 && c->asn_base != NULL && count != NULL) {
1174 sprintf (filename, "%s.%d", c->asn_base, count->file_num);
1175 c->asn1_out = StartAsnFile (filename, c->sbp, c->desc_list);
1176 if (c->asn1_out == NULL) {
1177 WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1178 rval = 0;
1179 }
1180 }
1181 if (c->asn1_out != NULL) {
1182 if (c->asn1_include_reads) {
1183 sep = MakeContigSeqEntryWithReads (contig);
1184 } else {
1185 sep = MakeSeqEntryFromContig (contig);
1186 }
1187 if (c->desc_list != NULL) {
1188 VisitBioseqsInSep (sep, c->desc_list, AddDescrToNucBioseqCallback);
1189 }
1190 SeqEntryAsnWrite(sep, c->asn1_out, c->atp);
1191 sep = SeqEntryFree (sep);
1192 if (count != NULL && c->contig_count >= count->num_contigs) {
1193 c->asn1_out = EndAsnFile (c->asn1_out, c->sbp != NULL);
1194 }
1195 }
1196
1197 /* write FASTA */
1198 if (c->fasta_out == NULL
1199 && c->fasta_base != NULL && count != NULL) {
1200 sprintf (filename, "%s.%d", c->fasta_base, count->file_num);
1201 c->fasta_out = FileOpen (filename, "w");
1202 if (c->fasta_out == NULL) {
1203 WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1204 rval = 0;
1205 }
1206 }
1207 if (c->fasta_out != NULL) {
1208 WriteFASTAFromContig (contig, c->fasta_out);
1209 if (count != NULL && c->contig_count >= count->num_contigs) {
1210 FileClose (c->fasta_out);
1211 c->fasta_out = NULL;
1212 }
1213 }
1214
1215 /* write quality scores */
1216 if (c->qual_out == NULL
1217 && c->qual_base != NULL && count != NULL) {
1218 sprintf (filename, "%s.%d", c->qual_base, count->file_num);
1219 c->qual_out = FileOpen (filename, "w");
1220 if (c->qual_out == NULL) {
1221 WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1222 rval = 0;
1223 }
1224 }
1225 if (c->qual_out != NULL) {
1226 WriteContigQualScores (contig, c->qual_out);
1227 if (count != NULL && c->contig_count >= count->num_contigs) {
1228 FileClose (c->qual_out);
1229 c->qual_out = NULL;
1230 }
1231 }
1232
1233 /* write XML */
1234 if (c->xml_out == NULL
1235 && c->xml_base != NULL && count != NULL) {
1236 sprintf (filename, "%s.%d", c->xml_base, count->file_num);
1237 c->xml_out = FileOpen (filename, "w");
1238 WriteTraceAssemblyHeader ("UPDATE", c->subref, c->center_name, c->taxid, c->description, c->assembly,
1239 count->num_contigs, count->num_conbases, count->num_reads, count->num_readbases,
1240 c->xml_out);
1241
1242 if (c->xml_out == NULL) {
1243 WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1244 rval = 0;
1245 }
1246 }
1247 if (c->xml_out != NULL) {
1248 WriteTraceAssemblyFromContig (contig, c->xml_out);
1249 if (count != NULL && c->contig_count >= count->num_contigs) {
1250 WriteTraceAssemblyTrailer (c->xml_out);
1251 FileClose (c->xml_out);
1252 c->xml_out = NULL;
1253 }
1254 }
1255 }
1256
1257 if (count != NULL && c->contig_count >= count->num_contigs) {
1258 tmp = c->file_counts_list;
1259 c->file_counts_list = tmp->next;
1260 tmp->next = NULL;
1261 tmp = ValNodeFreeData (tmp);
1262 c->contig_count = 0;
1263 }
1264
1265 return 1;
1266 }
1267
1268
1269 static BioSourcePtr BioSourceDescriptorFromTaxId (Int4 taxid)
1270 {
1271 BioSourcePtr biop = NULL;
1272 DbtagPtr dbtag;
1273
1274 if (taxid > 0) {
1275 biop = BioSourceNew ();
1276 biop->org = OrgRefNew();
1277 dbtag = DbtagNew ();
1278 dbtag->db = StringSave ("taxon");
1279 dbtag->tag = ObjectIdNew ();
1280 dbtag->tag->id = taxid;
1281 ValNodeAddPointer (&(biop->org->db), 0, dbtag);
1282 }
1283 return biop;
1284 }
1285
1286
1287 static void ReadLargeAceFile
1288 (CharPtr acefile,
1289 CharPtr asn1_out,
1290 CharPtr fasta_out,
1291 CharPtr template_in,
1292 CharPtr qual_scores_out,
1293 CharPtr xml_out,
1294 CharPtr id_lookup,
1295 char *has_errors,
1296 Boolean recalculate_consensus,
1297 Boolean recalculate_only_Ns,
1298 CharPtr subref,
1299 CharPtr center_name,
1300 Int4 taxid,
1301 CharPtr description,
1302 CharPtr assembly,
1303 Boolean no_lookup,
1304 Boolean is_srr,
1305 Boolean make_qual_scores,
1306 Int4 chunk_size,
1307 EReadNameType read_name_type,
1308 Boolean include_reads)
1309 {
1310 ReadBufferData rbd;
1311 ContigCallbackData c;
1312 SeqEntryPtr old_scope;
1313 FILE *f;
1314 SeqSubmitPtr ssp = NULL;
1315 CitSubPtr csp;
1316 Pointer dataptr;
1317 Uint2 datatype;
1318 SeqDescrPtr sdp, sdp_next;
1319 ContigFileListData file_count_list;
1320 ContigCountCallbackPtr summ;
1321 Boolean has_source = FALSE;
1322
1323 MemSet (&c, 0, sizeof (ContigCallbackData));
1324
1325 c.no_lookup = no_lookup;
1326 c.is_srr = is_srr;
1327 c.has_errors = has_errors;
1328 c.asn1_include_reads = include_reads;
1329 c.read_name_type = read_name_type;
1330
1331 /* filenames */
1332 c.asn_base = asn1_out;
1333 c.asn1_out = NULL;
1334 c.fasta_base = fasta_out;
1335 c.fasta_out = NULL;
1336 c.qual_base = qual_scores_out;
1337 c.qual_out = NULL;
1338 c.xml_base = xml_out;
1339 c.xml_out = NULL;
1340
1341 /* XML values */
1342 c.subref = subref;
1343 c.center_name = center_name;
1344 c.taxid = taxid;
1345 c.description = description;
1346 c.assembly = assembly;
1347
1348 file_count_list.list = NULL;
1349 file_count_list.current = NULL;
1350 file_count_list.max_bases = chunk_size;
1351
1352 rbd.fp = OpenAceFile (acefile);
1353 if (rbd.fp == NULL) {
1354 WriteXMLMsgUnableToOpenFile (c.has_errors, acefile);
1355 goto escape;
1356 }
1357 rbd.current_data = NULL;
1358
1359 ProcessLargeACEFileForContigFastaAndQualScores ( AbstractReadFunction, &rbd,
1360 qual_scores_out == NULL ? make_qual_scores : TRUE,
1361 has_errors, ProcessContigCountCallback, &file_count_list);
1362
1363 FileClose (rbd.fp);
1364 rbd.fp = NULL;
1365
1366 /* prepare XML output */
1367 if (c.xml_base != NULL) {
1368 if (chunk_size < 1) {
1369 summ = SummarizeContigCountList (file_count_list.list);
1370 c.xml_out = FileOpen (c.xml_base, "w");
1371 if (c.xml_out == NULL) {
1372 WriteXMLMsgUnableToOpenFile (c.has_errors, c.xml_base);
1373 goto escape;
1374 }
1375 WriteTraceAssemblyHeader ("NEW", c.subref, c.center_name, c.taxid, c.description, c.assembly,
1376 summ->num_contigs, summ->num_conbases, summ->num_reads, summ->num_readbases,
1377 c.xml_out);
1378 summ = MemFree (summ);
1379 file_count_list.list = ValNodeFreeData (file_count_list.list);
1380 } else {
1381 #ifdef ONE_CONTIG_FOR_FIRST
1382 /* temporarily, start the first file instead, which will have just one contig */
1383 c.xml_out = FileOpen (c.xml_base, "w");
1384 if (c.xml_out == NULL) {
1385 WriteXMLMsgUnableToOpenFile (c.has_errors, c.xml_base);
1386 goto escape;
1387 }
1388 summ = (ContigCountCallbackPtr) file_count_list.list->data.ptrvalue;
1389 WriteTraceAssemblyHeader ("NEW", c.subref, c.center_name, c.taxid, c.description, c.assembly,
1390 summ->num_contigs, summ->num_conbases, summ->num_reads, summ->num_readbases,
1391 c.xml_out);
1392 #else
1393 f = FileOpen (c.xml_base, "w");
1394 if (f == NULL) {
1395 WriteXMLMsgUnableToOpenFile (c.has_errors, c.xml_base);
1396 goto escape;
1397 }
1398 WriteTraceAssemblyHeader ("NEW", c.subref, c.center_name, c.taxid, c.description, c.assembly,
1399 0, 0, 0, 0,
1400 f);
1401 WriteTraceAssemblyTrailer (f);
1402 FileClose (f);
1403 #endif
1404 }
1405 } else {
1406 if (chunk_size < 1) {
1407 file_count_list.list = ValNodeFreeData (file_count_list.list);
1408 }
1409 }
1410
1411 c.file_counts_list = file_count_list.list;
1412
1413 /* read template file */
1414 c.sbp = NULL;
1415 c.desc_list = NULL;
1416
1417 if (!StringHasNoText (template_in)) {
1418 f = FileOpen (template_in, "r");
1419 if (f == NULL) {
1420 WriteXMLMsgUnableToOpenFile (c.has_errors, template_in);
1421 goto escape;
1422 }
1423 while ((dataptr = ReadAsnFastaOrFlatFile (f, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
1424 if (datatype == OBJ_SEQSUB) {
1425 ssp = (SeqSubmitPtr) dataptr;
1426 c.sbp = ssp->sub;
1427 ssp->sub = NULL;
1428 ssp = SeqSubmitFree (ssp);
1429 } else if (datatype == OBJ_SUBMIT_BLOCK) {
1430 c.sbp = (SubmitBlockPtr) dataptr;
1431 } else if (datatype == OBJ_SEQDESC) {
1432 sdp = (SeqDescrPtr) dataptr;
1433 if (sdp->choice == Seq_descr_source) {
1434 has_source = TRUE;
1435 }
1436 ValNodeLink (&(c.desc_list), (ValNodePtr) dataptr);
1437 } else {
1438 ObjMgrFree (datatype, dataptr);
1439 }
1440 }
1441 FileClose (f);
1442 if (c.sbp != NULL) {
1443 c.sbp->tool = MemFree (c.sbp->tool);
1444 c.sbp->tool = StringSave ("aceread");
1445 c.sbp->hup = FALSE;
1446 c.sbp->reldate = DateFree (c.sbp->reldate);
1447 csp = c.sbp->cit;
1448 if (csp != NULL) {
1449 csp->date = DateFree (csp->date);
1450 csp->date = DateCurr ();
1451 }
1452 }
1453 }
1454
1455 if (taxid > 0 && !has_source) {
1456 /* tax lookup? */
1457 sdp = SeqDescrNew (NULL);
1458 sdp->choice = Seq_descr_source;
1459 sdp->data.ptrvalue = BioSourceDescriptorFromTaxId (taxid);
1460 ValNodeLink (&(c.desc_list), (ValNodePtr) sdp);
1461 }
1462
1463 c.atp = AsnFind ("Bioseq-set.seq-set.E");
1464
1465 c.recalculate_consensus = recalculate_consensus;
1466 c.recalculate_only_Ns = recalculate_only_Ns;
1467
1468 if (id_lookup != NULL) {
1469 f = FileOpen (id_lookup, "r");
1470 if (f == NULL) {
1471 WriteXMLMsgUnableToOpenFile (c.has_errors, id_lookup);
1472 goto escape;
1473 }
1474 c.id_replacement_list = ReadSeqIdPairListFromFile (f);
1475 SeqEntrySetScope (old_scope);
1476 FileClose (f);
1477 }
1478
1479 if (chunk_size < 1) {
1480 if (c.asn_base != NULL) {
1481 c.asn1_out = StartAsnFile (c.asn_base, c.sbp, c.desc_list);
1482 if (c.asn1_out == NULL) {
1483 WriteXMLMsgUnableToOpenFile (c.has_errors, c.asn_base);
1484 goto escape;
1485 }
1486 }
1487 if (c.fasta_base != NULL) {
1488 c.fasta_out = FileOpen (c.fasta_base, "w");
1489 if (c.fasta_out == NULL) {
1490 WriteXMLMsgUnableToOpenFile (c.has_errors, c.fasta_base);
1491 goto escape;
1492 }
1493 }
1494 if (c.qual_out != NULL) {
1495 c.qual_out = FileOpen (c.qual_base, "w");
1496 if (c.qual_out == NULL) {
1497 WriteXMLMsgUnableToOpenFile (c.has_errors, c.qual_base);
1498 goto escape;
1499 }
1500 }
1501 }
1502
1503 rbd.fp = OpenAceFile (acefile);
1504 if (rbd.fp == NULL) {
1505 WriteXMLMsgUnableToOpenFile (c.has_errors, acefile);
1506 goto escape;
1507 }
1508 rbd.current_data = NULL;
1509
1510 ProcessLargeACEFileForContigFastaAndQualScores ( AbstractReadFunction, &rbd,
1511 qual_scores_out == NULL ? FALSE : TRUE,
1512 has_errors, ProcessContigCallback, &c);
1513
1514
1515 escape:
1516 FileClose (rbd.fp);
1517 c.id_replacement_list = SeqIdReplaceListFree (c.id_replacement_list);
1518 /* free c.desc_list */
1519 for (sdp = c.desc_list; sdp != NULL; sdp = sdp_next) {
1520 sdp_next = sdp->next;
1521 sdp->next = NULL;
1522 sdp = SeqDescrFree (sdp);
1523 }
1524 if (c.xml_out != NULL) {
1525 WriteTraceAssemblyTrailer (c.xml_out);
1526 FileClose (c.xml_out);
1527 c.xml_out = NULL;
1528 }
1529 if (c.asn1_out != NULL) {
1530 c.asn1_out = EndAsnFile (c.asn1_out, c.sbp != NULL);
1531 }
1532 if (c.fasta_out != NULL) {
1533 FileClose (c.fasta_out);
1534 c.fasta_out = NULL;
1535 }
1536 if (c.qual_out != NULL) {
1537 FileClose (c.qual_out);
1538 c.qual_out = NULL;
1539 }
1540 c.sbp = SubmitBlockFree (c.sbp);
1541 }
1542
1543
1544 Int2 Main (void)
1545
1546 {
1547 CharPtr infile, outfile, xmlfile;
1548
1549 ReadBufferData rbd;
1550 TACEFilePtr afp;
1551 Int4 i, len;
1552 SeqEntryPtr sep;
1553 AsnIoPtr aip;
1554 FILE *f = NULL;
1555 FILE *f2;
1556 CharPtr app = "aceread_tst";
1557 BioseqSetPtr bssp;
1558 SeqEntryPtr last_sep = NULL;
1559 Uint2 entityID;
1560 Boolean make_qual_scores, suppress_lookup, srr_ids, fasta_out;
1561 CharPtr submitter_ref = NULL, archive_id = NULL, description = NULL, assembly = NULL;
1562 CharPtr center_name = NULL;
1563 CharPtr format = NULL;
1564 CharPtr gap_string;
1565 CharPtr asn_file = NULL;
1566 Int4 limit = 0;
1567 char has_errors = 0;
1568 Boolean recalculate_consensus = FALSE, recalculate_only_Ns = FALSE;
1569 CharPtr recalculate_options;
1570 SeqSubmitPtr ssp;
1571 CharPtr id_substitution_file = NULL;
1572 Int4 taxon_id = 0;
1573
1574 /* standard setup */
1575
1576 ErrSetFatalLevel (SEV_MAX);
1577 ErrSetMessageLevel (SEV_MAX);
1578 ErrClearOptFlags (EO_SHOW_USERSTR);
1579 ErrSetLogfile ("stderr", ELOG_APPEND);
1580 ErrSetOpts (ERR_IGNORE, ERR_LOG_ON);
1581
1582 UseLocalAsnloadDataAndErrMsg ();
1583 ErrPathReset ();
1584
1585 if (! AllObjLoad ()) {
1586 Message (MSG_FATAL, "AllObjLoad failed");
1587 return 1;
1588 }
1589 if (! SubmitAsnLoad ()) {
1590 Message (MSG_FATAL, "SubmitAsnLoad failed");
1591 return 1;
1592 }
1593 if (! FeatDefSetLoad ()) {
1594 Message (MSG_FATAL, "FeatDefSetLoad failed");
1595 return 1;
1596 }
1597 PubSeqFetchEnable ();
1598
1599 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1600 return 0;
1601 }
1602
1603 recalculate_options = (CharPtr) myargs[N_argRecalculateConsensus].strvalue;
1604 if (!StringHasNoText (recalculate_options)) {
1605 if (StringCmp (recalculate_options, "W") == 0) {
1606 recalculate_consensus = TRUE;
1607 recalculate_only_Ns = FALSE;
1608 } else if (StringCmp (recalculate_options, "N") == 0) {
1609 recalculate_consensus = TRUE;
1610 recalculate_only_Ns = TRUE;
1611 } else {
1612 Message (MSG_FATAL, "Invalid consensus sequence recalculation option");
1613 return 1;
1614 }
1615 }
1616
1617
1618 /* test gap info reading if provided */
1619 gap_string = (CharPtr) myargs[G_argGapString].strvalue;
1620 TestGapInfoReading (gap_string);
1621
1622 /* limit number of contigs? for debugging purposes */
1623 limit = myargs[l_argLimitNumContigs].intvalue;
1624
1625 /* select format of input file */
1626 format = (CharPtr) myargs[F_argFormat].strvalue;
1627 if (StringHasNoText (format)) {
1628 format = "A";
1629 }
1630
1631 infile = (CharPtr) myargs [i_argInputFile].strvalue;
1632 if (StringHasNoText (infile)) {
1633 Message (MSG_FATAL, "Must supply input file!");
1634 return 1;
1635 }
1636 outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
1637 xmlfile = (CharPtr) myargs[X_argXMLFile].strvalue;
1638 make_qual_scores = (Boolean) myargs [Q_argMakeQualScores].intvalue;
1639 center_name = (CharPtr) myargs[C_argCenter].strvalue;
1640 suppress_lookup = (Boolean) myargs [L_argSuppressIdLookup].intvalue;
1641 srr_ids = (Boolean) myargs[R_argSRRids].intvalue;
1642 fasta_out = (Boolean) myargs[f_argFASTA].intvalue;
1643
1644 /* ASN.1 file to validate against */
1645 asn_file = (CharPtr) myargs [V_argValidateAgainstAsn1File].strvalue;
1646
1647 if (!GetTSAFieldsFromString ((CharPtr) myargs [T_argTSAFields].strvalue,
1648 &submitter_ref,
1649 &archive_id,
1650 &description,
1651 &assembly,
1652 &taxon_id)) {
1653 Message (MSG_FATAL, "Error reading TSA fields");
1654 return 1;
1655 }
1656
1657 if (!StringHasNoText (xmlfile) && (StringHasNoText (center_name) || taxon_id < 1)) {
1658 PrintACEFormatErrorXML ("Must specify center name and taxid for XML output", NULL, &has_errors);
1659 printf ("</aceread>\n");
1660 return 1;
1661 }
1662
1663 len = StringLen (infile);
1664 if (StringHasNoText (outfile)) {
1665 if (len > 3 && StringCmp (infile + len - 4, ".ace") == 0) {
1666 outfile = StringSave (infile);
1667 StringCpy (outfile + len - 3, "sqn");
1668 } else if (len > 6 && StringCmp (infile + len - 7, ".ace.gz") == 0) {
1669 outfile = StringSave (infile);
1670 StringCpy (outfile + len - 6, "sqn");
1671 } else {
1672 outfile = (CharPtr) MemNew (sizeof (Char) * (len + 5));
1673 sprintf (outfile, "%s.sqn", infile);
1674 }
1675 }
1676
1677 if (!StringHasNoText ((CharPtr) myargs [S_argIDSubstitutionFile].strvalue)) {
1678 id_substitution_file = ((CharPtr) myargs [S_argIDSubstitutionFile].strvalue);
1679 }
1680
1681 if (StringChr (format, 'A') != NULL) {
1682 ReadLargeAceFile (infile, fasta_out ? NULL : outfile,
1683 fasta_out ? outfile : NULL,
1684 (CharPtr) myargs[t_argTemplateFile].strvalue,
1685 NULL, xmlfile, id_substitution_file, &has_errors,
1686 recalculate_consensus, recalculate_only_Ns,
1687 submitter_ref, center_name, taxon_id, description, assembly,
1688 suppress_lookup, srr_ids, make_qual_scores,
1689 myargs[c_argChunkSize].intvalue,
1690 ReadNameTypeFromArg (myargs[n_argReadNameType].strvalue),
1691 (Boolean) myargs [z_argIncludeReads].intvalue);
1692 if (has_errors) {
1693 printf ("</aceread>\n");
1694 return 1;
1695 } else {
1696 return 0;
1697 }
1698 }
1699
1700 if (id_substitution_file != NULL) {
1701 f = FileOpen (id_substitution_file, "r");
1702 if (f == NULL) {
1703 Message (MSG_FATAL, "Unable to open %s", id_substitution_file);
1704 return 1;
1705 }
1706 }
1707
1708 if (StringChr (format, 'M') != NULL) {
1709 rbd.fp = FileOpen (infile, "r");
1710 if (rbd.fp == NULL) {
1711 Message (MSG_FATAL, "Unable to open %s", infile);
1712 return 1;
1713 }
1714
1715 rbd.current_data = NULL;
1716 afp = ReadMAQFile (AbstractReadFunction, &rbd);
1717 } else if (StringChr (format, 'E') != NULL) {
1718 rbd.fp = FileOpen (infile, "r");
1719 if (rbd.fp == NULL) {
1720 Message (MSG_FATAL, "Unable to open %s", infile);
1721 return 1;
1722 }
1723
1724 rbd.current_data = NULL;
1725 afp = ReadElandStandaloneFile (AbstractReadFunction, &rbd);
1726 } else if (StringChr (format, 'A') != NULL) {
1727 rbd.fp = OpenAceFile (infile);
1728 if (rbd.fp == NULL) {
1729 Message (MSG_FATAL, "Unable to open %s", infile);
1730 return 1;
1731 }
1732 rbd.current_data = NULL;
1733 afp = ReadACEFile ( AbstractReadFunction, &rbd, make_qual_scores, &has_errors);
1734 } else {
1735 Message (MSG_FATAL, "Unrecognized format: %s\n", format);
1736 return 1;
1737 }
1738 FileClose (rbd.fp);
1739 if (afp == NULL) {
1740 printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Unable to read file</message>\n");
1741 } else {
1742 if (recalculate_consensus) {
1743 if (!AddReadQualityScores (afp, (CharPtr) myargs [q_argReadQualScoresFile].strvalue, (CharPtr) myargs [r_argReadFASTAFile].strvalue)) {
1744 printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Failed to add read quality scores</message>\n");
1745 } else {
1746 RecalculateConsensusSequences (afp, recalculate_only_Ns);
1747 }
1748 }
1749
1750 if (limit > 0) {
1751 for (i = limit; i < afp->num_contigs; i++) {
1752 ContigFree (afp->contigs[i]);
1753 afp->contigs[i] = NULL;
1754 }
1755 afp->num_contigs = limit;
1756 }
1757
1758 if (f != NULL) {
1759 UpdateAceFileIds (afp, f, suppress_lookup, srr_ids, &has_errors);
1760 FileClose (f);
1761 f = NULL;
1762 }
1763 ValidateAceFileIds (afp, &has_errors);
1764
1765 if (asn_file != NULL) {
1766 if (ValidateAgainstASNFile (afp, asn_file, &has_errors)) {
1767 printf ("Validation against %s succeeded\n", asn_file);
1768 }
1769 }
1770
1771 if (!StringHasNoText (xmlfile)) {
1772 f2 = FileOpen (xmlfile, "w");
1773 WriteTraceAssemblyFromAceFile (afp, submitter_ref, center_name, 0, description, f2);
1774 FileClose (f2);
1775 }
1776
1777 if (fasta_out) {
1778 f2 = FileOpen (outfile, "w");
1779 WriteFASTAFromAceFile (afp, f2);
1780 FileClose (f2);
1781 } else {
1782 aip = AsnIoOpen (outfile, "w");
1783 if (aip == NULL) {
1784 printf ("Unable to open %s\n", outfile);
1785 } else {
1786 bssp = BioseqSetNew ();
1787 bssp->_class = BioseqseqSet_class_genbank;
1788
1789 for (i = 0; i < afp->num_contigs; i++) {
1790 sep = MakeSeqEntryFromContig (afp->contigs[i]);
1791 if (last_sep == NULL) {
1792 bssp->seq_set = sep;
1793 } else {
1794 last_sep->next = sep;
1795 }
1796 last_sep = sep;
1797 }
1798 sep = ValNodeNew (NULL);
1799 sep->choice = 2;
1800 sep->data.ptrvalue = bssp;
1801 bssp->seqentry = sep;
1802 SeqMgrLinkSeqEntry (sep, 0, NULL);
1803 entityID = ObjMgrGetEntityIDForChoice (sep);
1804 AssignIDsInEntityEx (entityID, 0, NULL, NULL);
1805 SeqMgrIndexFeatures (entityID, sep);
1806 ssp = AddSeqSubmitFromTemplate (sep, (CharPtr) myargs[t_argTemplateFile].strvalue);
1807 if (ssp == NULL) {
1808 SeqEntryAsnWrite (sep, aip, NULL);
1809 sep = SeqEntryFree (sep);
1810 } else {
1811 SeqSubmitAsnWrite (ssp, aip, NULL);
1812 ssp = SeqSubmitFree (ssp);
1813 }
1814 AsnIoClose (aip);
1815 }
1816 }
1817 }
1818
1819 if (has_errors) {
1820 printf ("</aceread>\n");
1821 }
1822
1823 return 0;
1824
1825 }
1826
1827 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |