NCBI C Toolkit Cross Reference

C/demo/aceread_tst.c


  1 /*   aceread_tst.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  aceread_tst.c
 27 *
 28 * Author:  Colleen Bollin
 29 *
 30 * Version Creation Date:   7/22/08
 31 *
 32 * $Revision: 1.27 $
 33 *
 34 * File Description: 
 35 *
 36 * Modifications:  
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45 #include <ncbi.h>
 46 #include <objall.h>
 47 #include <objsset.h>
 48 #include <objsub.h>
 49 #include <objfdef.h>
 50 #include <seqport.h>
 51 #include <sequtil.h>
 52 #include <sqnutils.h>
 53 #include <subutil.h>
 54 #include <gather.h>
 55 #include <explore.h>
 56 #include <lsqfetch.h>
 57 #include <valid.h>
 58 #include <pmfapi.h>
 59 #ifdef INTERNAL_NCBI_ASNDISC
 60 #include <accpubseq.h>
 61 #include <tax3api.h>
 62 #endif
 63 
 64 #include "aceread.h"
 65 #include "acerdapi.h"
 66 
 67 typedef enum {
 68   i_argInputFile,
 69   o_argOutputFile,
 70   f_argFASTA,
 71   S_argIDSubstitutionFile,
 72   R_argSRRids,
 73   L_argSuppressIdLookup,
 74   Q_argMakeQualScores,
 75   X_argXMLFile,
 76   t_argTemplateFile,
 77   T_argTSAFields,
 78   C_argCenter,
 79   F_argFormat,
 80   G_argGapString,
 81   V_argValidateAgainstAsn1File,
 82   q_argReadQualScoresFile,
 83   r_argReadFASTAFile,
 84   N_argRecalculateConsensus,
 85   c_argChunkSize,
 86   n_argReadNameType,
 87   z_argIncludeReads,
 88   l_argLimitNumContigs
 89 } EArgNum;
 90 
 91 Args myargs [] = {
 92   {"Single Input File", "stdin", NULL, NULL,
 93     TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
 94   {"Single Output File", NULL, NULL, NULL,
 95     TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
 96   {"FASTA Output", "F", NULL, NULL,
 97     TRUE, 'f', ARG_BOOLEAN, 0.0, 0, NULL},
 98   {"ID Substitution File", "", NULL, NULL,
 99     TRUE, 'S', ARG_FILE_IN, 0.0, 0, NULL},
100   {"Replacement IDs are SRR", "F", NULL, NULL,
101     TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
102   {"Suppress ID Lookup", "F", NULL, NULL,
103     TRUE, 'L', ARG_BOOLEAN, 0.0, 0, NULL},
104   {"Make Qual Scores", "T", NULL, NULL,
105     TRUE, 'Q', ARG_BOOLEAN, 0.0, 0, NULL},
106   {"XML Output File", "", NULL, NULL,
107     TRUE, 'X', ARG_FILE_OUT, 0.0, 0, NULL },
108   {"Template File", "", NULL, NULL,
109     TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL },
110   {"TSA fields", NULL, NULL, NULL,
111     TRUE, 'T', ARG_STRING, 0.0, 0, NULL },
112   {"Genome Center Tag", NULL, NULL, NULL,
113     TRUE, 'C', ARG_STRING, 0.0, 0, NULL},
114   {"Assembly Format\n\tM MAQ\n\tE Standalone Eland\n\tA ACE", "A", NULL, NULL,
115     TRUE, 'F', ARG_STRING, 0.0, 0, NULL},
116   {"Gap String", NULL, NULL, NULL,
117     TRUE, 'G', ARG_STRING, 0.0, 0, NULL},
118   {"ASN.1 File to validate against", NULL, NULL, NULL,
119     TRUE, 'V', ARG_FILE_IN, 0.0, 0, NULL},
120   {"Quality score file for read sequences", NULL, NULL, NULL,
121     TRUE, 'q', ARG_FILE_IN, 0.0, 0, NULL},
122   {"FASTA file for read sequences (to use when trimming read quality scores)", NULL, NULL, NULL,
123     TRUE, 'r', ARG_FILE_IN, 0.0, 0, NULL},
124   {"Recalculate consensus sequence using read data\n\tW Whole Consensus\n\tN Ns Only", "", NULL, NULL,
125     TRUE, 'N', ARG_STRING, 0.0, 0, NULL},
126   {"Number of contig bases per file", "50000", NULL, NULL,
127     TRUE, 'c', ARG_INT, 0.0, 0, NULL},
128   {"Read name type in ACE file\n\tL local trace name\n\tT TI number\n\tS SRR ID\n", "L", NULL, NULL,
129     TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
130   {"Include read sequences in ASN.1 output", "F", NULL, NULL,
131     TRUE, 'z', ARG_BOOLEAN, 0.0, 0, NULL},
132   {"Limit number of contigs to read", NULL, NULL, NULL,
133     TRUE, 'l', ARG_INT, 0.0, 0, NULL},
134 };
135 
136 
137 static FILE *OpenAceFile (CharPtr infile)
138 {
139   FILE        *f;
140   Int4        len;
141 #ifdef OS_UNIX
142   Char            cmmd [256];
143   CharPtr         gzcatprog;
144   int             ret;
145   Boolean         usedPopen = FALSE;
146 #endif
147 
148   len = StringLen (infile);
149   if (StringCmp (infile + len - 3, ".gz") == 0) {
150 #ifdef OS_UNIX
151     gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
152     if (gzcatprog != NULL) {
153       sprintf (cmmd, "%s %s", gzcatprog, infile);
154     } else {
155       ret = system ("gzcat -h >/dev/null 2>&1");
156       if (ret == 0) {
157         sprintf (cmmd, "gzcat %s", infile);
158       } else if (ret == -1) {
159         Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
160         return NULL;
161       } else {
162         ret = system ("zcat -h >/dev/null 2>&1");
163         if (ret == 0) {
164           sprintf (cmmd, "zcat %s", infile);
165         } else if (ret == -1) {
166           Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
167           return NULL;
168         } else {
169           Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
170           return NULL;
171         }
172       }
173     }
174     f = popen (cmmd, "r");
175     usedPopen = TRUE;
176 #else
177     Message (MSG_POSTERR, "Unable to read gzipped files when not running in UNIX");
178     return NULL;
179 #endif
180   } else {
181     f = FileOpen (infile, "r");
182   }
183   return f;
184 }
185 
186 
187 static Boolean ValidateAgainstASNFile (TACEFilePtr ace_file, CharPtr filename, char *has_errors)
188 {
189   Pointer      dataptr;
190   Uint2        datatype;
191   SeqEntryPtr  sep = NULL;
192   SeqSubmitPtr ssp = NULL;
193   Boolean      chars_stripped = FALSE;
194   FILE *fp;
195   Boolean      rval = FALSE;
196   
197 
198   fp = FileOpen (filename, "r");
199   if (fp == NULL) {
200     printf ("Unable to open %s\n", filename);
201     return FALSE;
202   }
203 
204   /* Read in one sequence from the file */
205   dataptr = ReadAsnFastaOrFlatFileEx (fp, &datatype, NULL, FALSE, FALSE,
206                                                           TRUE, FALSE, &chars_stripped);      
207   FileClose (fp);
208   if (NULL == dataptr) 
209   {
210     printf ("Unable to read SeqEntry from %s\n", filename);
211     return FALSE;
212   }
213 
214   /* Convert the file data to a SeqEntry */
215   
216   if (datatype == OBJ_SEQENTRY)
217     sep = (SeqEntryPtr) dataptr;
218   else if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET)
219     sep = SeqMgrGetSeqEntryForData (dataptr);
220   else if (datatype == OBJ_SEQSUB) 
221   {
222     ssp = (SeqSubmitPtr) dataptr;
223     if (ssp != NULL && ssp->datatype == 1)
224     {
225       sep = (SeqEntryPtr) ssp->data;
226     }
227   }
228   
229   rval = ValidateACEFileAgainstSeqEntry (ace_file, sep, has_errors);
230 
231   if (ssp != NULL) {
232     ssp = SeqSubmitFree (ssp);
233   } else {
234     sep = SeqEntryFree (sep);
235   }
236   return rval;
237  
238 }
239 
240 
241 static Boolean StringNHasNoText (CharPtr str, Int4 n)
242 {
243   CharPtr cp;
244   Int4    i;
245   if (str == NULL) return TRUE;
246   cp = str;
247   i = 0;
248   while (i < n) {
249     if (*cp == 0) return TRUE;
250     if (!isspace (*cp)) return FALSE;
251     cp++;
252     i++;
253   }
254   return TRUE;
255 }
256 
257 
258 static Boolean BracketMatchesLabel (CharPtr cp, CharPtr cp_equal, CharPtr label) 
259 {
260   Int4 len;
261 
262   if (cp == NULL || cp_equal == NULL || label == NULL) return FALSE;
263 
264   len = StringLen (label);
265   if (StringNCmp (cp, label, len) == 0 
266         && StringNHasNoText (cp + len, cp_equal - cp - len)) {
267     return TRUE;
268   } else {
269     return FALSE;
270   }
271 }
272 
273 
274 static CharPtr GetBracketValue (CharPtr cp, CharPtr cp_end)
275 {
276   Int4 len;
277   CharPtr val = NULL;
278 
279   if (cp == NULL || cp_end == NULL || cp_end <= cp) return NULL;
280 
281   cp += StringSpn (cp, " \t");
282   len = (cp_end - cp) + 1;
283   val = (CharPtr) MemNew (sizeof (Char) * len);
284   StringNCpy (val, cp, len - 1); 
285   val [len] = 0;
286   while (len > 1 && isspace (val [len-1])) {
287     len--;
288     val[len] = 0;
289   }
290   return val;
291 }
292 
293 
294 static Boolean
295 GetTSAFieldsFromString
296 (CharPtr str,
297  CharPtr PNTR p_submitter_reference,
298  CharPtr PNTR p_archive_id,
299  CharPtr PNTR p_description,
300  CharPtr PNTR p_assembly,
301  Int4Ptr p_taxon_id)
302 {
303   CharPtr cp, cp_next, cp_equal, cp_end;
304   CharPtr subref = NULL, arch_id = NULL, desc = NULL, assembly = NULL, tmp;
305   Boolean is_bad = FALSE;
306 
307   if (p_submitter_reference != NULL) {
308     *p_submitter_reference = NULL;
309   }
310   if (p_archive_id != NULL) {
311     *p_archive_id = NULL;
312   }
313   if (p_submitter_reference != NULL) {
314     *p_description = NULL;
315   }
316   if (StringHasNoText (str)) {
317     return TRUE;
318   }
319 
320   cp = StringChr (str, '[');
321   while (cp != NULL && !is_bad) {
322     cp++;
323     cp_next = StringChr (cp + 1, '[');
324     cp_equal = StringChr (cp, '=');
325     cp_end = StringChr (cp, ']');
326     if (cp_equal == NULL || cp_end == NULL) {
327       is_bad = TRUE;
328     } else if (cp_equal > cp_end) {
329       is_bad = TRUE;
330     } else if (cp_next != NULL && (cp_equal > cp_next || cp_end > cp_next)) {
331       is_bad = TRUE;
332     } else {
333       cp += StringSpn (cp, " \t");
334       if (BracketMatchesLabel (cp, cp_equal, "subref")) {
335         if (subref == NULL) {
336           subref = GetBracketValue (cp_equal + 1, cp_end);
337         } else {
338           is_bad = TRUE;
339         }
340       } else if (BracketMatchesLabel (cp, cp_equal, "archive_id")) {
341         if (arch_id == NULL) {
342           arch_id = GetBracketValue (cp_equal + 1, cp_end);
343         } else {
344           is_bad = TRUE;
345         }
346       } else if (BracketMatchesLabel (cp, cp_equal, "desc")) {
347         if (desc == NULL) {
348           desc = GetBracketValue (cp_equal + 1, cp_end);
349         } else {
350           is_bad = TRUE;
351         }
352       } else if (BracketMatchesLabel (cp, cp_equal, "assembly")) {
353         if (assembly == NULL) {
354           assembly = GetBracketValue (cp_equal + 1, cp_end);
355         } else {
356           is_bad = TRUE;
357         }
358       } else if (BracketMatchesLabel (cp, cp_equal, "taxon_id")) {
359         tmp = GetBracketValue (cp_equal + 1, cp_end);
360         if (p_taxon_id != NULL) {
361           *p_taxon_id = atoi (tmp);
362         }
363       } else {
364         is_bad = TRUE;
365       }
366     }
367     cp = cp_next;
368   }
369   if (p_submitter_reference == NULL) {
370     subref = MemFree (subref);
371   } else {
372     *p_submitter_reference = subref;
373   }
374   if (p_archive_id == NULL) {
375     arch_id = MemFree (arch_id);
376   } else {
377     *p_archive_id = arch_id;
378   }
379   if (p_description == NULL) {
380     desc = MemFree (desc);
381   } else {
382     *p_description = desc;
383   }
384   if (p_assembly == NULL) {
385     assembly = MemFree (assembly);
386   } else {
387     *p_assembly = assembly;
388   }
389   return TRUE;
390 }
391 
392 
393 static void PrintTraceGapsXML (TGapInfoPtr gap_info)
394 {
395   Int4 i;
396 
397   if (gap_info != NULL) {
398     printf ("    <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps);
399     if (gap_info->num_gaps > 0) {
400       printf ("      <tracegaps source=\"INLINE\">");
401       for (i = 0; i < gap_info->num_gaps - 1; i++) {
402         printf ("%d,", gap_info->gap_offsets[i]);
403       }
404       printf ("%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]);
405     }
406   }
407 }
408 
409 
410 static void TestPosConversions (TGapInfoPtr gap_info)
411 {
412   Int4 i, t_pos, s_pos = 0, r_pos;
413   Int4 test_len = 0;
414 
415   if (gap_info != NULL && gap_info->num_gaps > 0) {
416     for (i = 0; i < gap_info->num_gaps; i++) {
417       test_len += gap_info->gap_offsets[i] + 1;
418     }
419     for (i = 0; i < test_len; i++) {
420       s_pos = SeqPosFromTilingPos (i, gap_info);
421       t_pos = TilingPosFromSeqPos (s_pos, gap_info);
422       if (t_pos != i) {
423         printf ("Failed!  %d -> SeqPosFromTilingPos -> %d -> TilingPosFromSeqPos -> %d\n",
424                 i, s_pos, t_pos);
425       }
426       r_pos = SeqPosFromTilingPos (t_pos, gap_info);
427       if (r_pos != s_pos) {
428         printf ("Failed!  %d -> TilingPosFromSeqPos -> %d -> SeqPosFromTilingPos -> %d\n",
429                 s_pos, t_pos, r_pos);
430       }
431       /* printf ("%d:%d:%d:%d\n", i, s_pos, t_pos, r_pos); */
432     }
433   }
434 }
435 
436 
437 static void PrintTraceReadXML (TContigReadPtr read)
438 {
439   if (read == NULL) {
440     printf ("Bad read\n");
441   } else {
442     printf ("<trace>\n");
443     printf ("  <trace_name>%s</trace_name>\n", read->read_id == NULL ? "" : read->read_id);
444     PrintTraceGapsXML (read->gaps);
445     printf ("  <nbasecalls>%d</nbasecalls>\n", StringLen (read->read_seq));
446     printf ("  <valid>\n");
447     printf ("    <start>%d</start>\n", read->read_assem_start + 1);
448     printf ("    <stop>%d</stop>\n", read->read_assem_stop + 1);
449     printf ("  </valid>\n");
450     printf ("  <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD");
451     printf ("    <start>%d</start>\n", read->cons_start + 1);
452     printf ("    <start>%d</start>\n", read->cons_start + StringLen (read->read_seq) + 1);
453     printf ("  </tiling>\n");
454     printf ("  <consensus>\n");
455     printf ("    <start>%d</start>\n", read->cons_start + 1);
456     printf ("    <start>%d</start>\n", read->cons_start + StringLen (read->read_seq) + 1);
457     printf ("  </consensus>\n");
458     printf ("<trace>\n");
459   }
460 }
461 
462 
463 
464 static void TestGapInfoReading (CharPtr gap_string)
465 {
466   TGapInfoPtr  gap_info;
467   ValNodePtr   list, vnp;
468   
469   if (!StringHasNoText (gap_string)) {
470     gap_info = GapInfoFromSequenceString(gap_string, "*");
471     if (gap_info == NULL) {
472       printf ("error reading");
473     } else {
474       PrintTraceGapsXML (gap_info);
475       TestPosConversions (gap_info);
476       list = GetTransitionsFromGapInfo (gap_info, 0, 0, 40);
477       for (vnp = list; vnp != NULL; vnp = vnp->next) {
478         printf ("%d\n", vnp->data.intvalue);
479       }
480     }
481     GapInfoFree (gap_info);
482   }
483 }
484 
485 
486 static void AddAlignmentToSeqEntry (DenseSegPtr dsp, SeqEntryPtr sep)
487 {
488   SeqAnnotPtr  sap;
489   SeqAlignPtr  salp;
490   BioseqPtr    bsp;
491   BioseqSetPtr bssp;
492 
493   if (dsp == NULL || sep == NULL) return;
494 
495   sap = SeqAnnotNew ();
496   sap->type = 2;
497 
498   salp = SeqAlignNew ();
499   salp->type = 3;
500   salp->segtype = 2;
501   salp->segs = (Pointer) dsp;
502   salp->dim = dsp->dim;
503   sap->data = (Pointer) salp;
504 
505   if (IS_Bioseq (sep)) {
506     bsp = (BioseqPtr) sep->data.ptrvalue;
507     sap->next = bsp->annot;
508     bsp->annot = sap;
509   } else if (IS_Bioseq_set (sep)) {
510     bssp = (BioseqSetPtr) sep->data.ptrvalue;
511     sap->next = bssp->annot;
512     bssp->annot = sap;
513   }
514 }
515 
516 
517 static void AddDescrToNucBioseqCallback (BioseqPtr bsp, Pointer data)
518 {
519   SeqDescrPtr sdp, sdp_copy;
520 
521   if (bsp == NULL || !ISA_na (bsp->mol) || data == NULL) { 
522     return;
523   }
524   sdp = (SeqDescrPtr) data;
525   while (sdp != NULL) {
526     if (sdp->choice != Seq_descr_pub) {
527       sdp_copy = (SeqDescrPtr) AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescrAsnRead, (AsnWriteFunc) SeqDescrAsnWrite);
528       sdp_copy->next = bsp->descr;
529       bsp->descr = sdp_copy;
530     }
531     sdp = sdp->next;
532   }
533 }
534 
535   
536 static SeqSubmitPtr AddSeqSubmitFromTemplate (SeqEntryPtr sep, CharPtr filename)
537 {
538   SeqSubmitPtr   ssp = NULL;
539   SubmitBlockPtr sbp;
540   CitSubPtr      csp;
541   FILE *fp = NULL;
542   Pointer         dataptr;
543   Uint2           datatype;
544 
545   if (StringHasNoText (filename)) {
546     return NULL;
547   }
548     
549   fp = FileOpen (filename, "r");
550   if (fp == NULL) {
551     printf ("Unable to read template file %s\n", filename);
552     return NULL;
553   }
554 
555   while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
556     if (datatype == OBJ_SEQSUB) {
557       ssp = (SeqSubmitPtr) dataptr;
558       ssp->datatype = 1;
559       ssp->data = sep;
560     } else if (datatype == OBJ_SUBMIT_BLOCK) {
561       sbp = (SubmitBlockPtr) dataptr;
562       ssp = SeqSubmitNew ();
563       ssp->datatype = 1;
564       ssp->data = sep;
565       ssp->sub = sbp;
566     } else if (datatype == OBJ_SEQDESC) {
567       VisitBioseqsInSep (sep, dataptr, AddDescrToNucBioseqCallback);
568       ObjMgrFree (datatype, dataptr);
569     } else {
570       ObjMgrFree (datatype, dataptr);
571     }
572   }
573   FileClose (fp);
574   if (ssp == NULL) {
575     ssp = SeqSubmitNew ();
576     ssp->datatype = 1;
577     ssp->data = sep;
578   }
579 
580   if (ssp->sub == NULL) {
581     ssp->sub = SubmitBlockNew ();
582   } 
583 
584   ssp->sub->tool = MemFree (ssp->sub->tool);
585   ssp->sub->tool = StringSave ("aceread");
586   ssp->sub->hup = FALSE;
587   ssp->sub->reldate = DateFree (ssp->sub->reldate);
588   csp = ssp->sub->cit;
589   if (csp != NULL) {
590     csp->date = DateFree (csp->date);
591     csp->date = DateCurr ();
592   }
593   return ssp;
594 }
595 
596 
597 static Boolean AddReadQualityScores (TACEFilePtr afp, CharPtr qs_filename, CharPtr rd_filename)
598 {
599   ReadBufferData q, r;
600   Boolean use_fasta = FALSE;
601   Boolean rval = FALSE;
602 
603   if (afp == NULL || StringHasNoText (qs_filename)) {
604     return TRUE;
605   }
606 
607   q.current_data = NULL;
608   r.current_data = NULL;
609 
610   q.fp = FileOpen (qs_filename, "r");
611   if (q.fp == NULL) {
612     printf ("Unable to read quality score file\n");
613     return FALSE;
614   }
615 
616   if (!StringHasNoText (rd_filename)) {
617     r.fp = FileOpen (rd_filename, "r");
618     if (r.fp == NULL) {
619       printf ("Unable to open read FASTA file\n");
620       FileClose (q.fp);
621       return FALSE;
622     }
623     use_fasta = TRUE;
624   }
625 
626   if (AddReadQualScores (afp, AbstractReadFunction, &q, use_fasta ? AbstractReadFunction : NULL, &r) > 0) {
627     rval = TRUE;
628   }
629 
630   FileClose (q.fp);
631   if (use_fasta) {
632     FileClose (r.fp);
633   }
634   return rval;
635 }
636 
637 
638 static Boolean LIBCALL MyBioseqSetAsnWrite (BioseqSetPtr bsp, AsnIoPtr aip, AsnTypePtr orig)
639 {
640         DataVal av;
641         AsnTypePtr atp;
642         Boolean retval = FALSE;
643 
644         if (aip == NULL)
645                 return FALSE;
646 
647         atp = AsnLinkType(orig, AsnFind ("Bioseq-set"));   /* link local tree */
648         if (atp == NULL) return FALSE;
649 
650         if (bsp == NULL) { AsnNullValueMsg(aip, atp); goto erret; }
651 
652         if (! AsnOpenStruct(aip, atp, (Pointer)bsp)) goto erret;
653     
654   if (bsp->id != NULL)
655         {
656     if (! ObjectIdAsnWrite(bsp->id, aip, AsnFind ("Bioseq-set.id"))) goto erret;
657         }
658   if (bsp->coll != NULL)
659         {
660     if (! DbtagAsnWrite(bsp->coll, aip, AsnFind ("Bioseq-set.coll"))) goto erret;
661         }
662   if (bsp->level != INT2_MIN)
663   {
664     av.intvalue = bsp->level;
665     if (! AsnWrite(aip, AsnFind ("Bioseq-set.level"), &av)) goto erret;
666   }
667   if (bsp->_class != 0)
668   {
669     av.intvalue = bsp->_class;
670     if (! AsnWrite(aip, AsnFind ("Bioseq-set.class"), &av)) goto erret;
671   }
672   if (bsp->release != NULL)
673   {
674     av.ptrvalue = bsp->release;
675     if (! AsnWrite(aip, AsnFind ("Bioseq-set.release"), &av)) goto erret;
676   }
677   if (bsp->date != NULL)
678         {
679     if (! DateAsnWrite(bsp->date, aip, AsnFind ("Bioseq-set.date"))) goto erret;
680         }
681   if (bsp->descr != NULL)              /* Seq-descr optional */
682         {
683     if (! SeqDescrAsnWrite(bsp->descr, aip, AsnFind ("Bioseq-set.descr"))) goto erret;
684         }
685 
686   if (! AsnOpenStruct(aip, AsnFind ("Bioseq-set.seq-set"), (Pointer)bsp->seq_set)) goto erret;
687   /* this is where we stop */
688   retval = TRUE;
689 erret:
690         AsnUnlinkType(orig);        /* unlink local tree */
691         return retval;
692 }
693 
694 static Boolean LIBCALL MySeqEntryAsnWrite (SeqEntryPtr sep, AsnIoPtr aip, AsnTypePtr orig)
695 {
696   AsnTypePtr atp;
697         DataVal av;
698         Boolean retval = FALSE;
699 
700         if (aip == NULL)
701                 return FALSE;
702 
703         atp = AsnLinkType(orig, AsnFind ("Seq-entry"));   /* link local tree */
704         if (atp == NULL) return FALSE;
705 
706         if (sep == NULL) { AsnNullValueMsg(aip, atp); goto erret; }
707 
708         av.ptrvalue = (Pointer)sep;
709   if (! AsnWriteChoice(aip, atp, (Int2)sep->choice, &av)) goto erret;
710   if (sep->choice == 1)
711         {
712     if (! BioseqAsnWrite((BioseqPtr)sep->data.ptrvalue, aip, AsnFind ("Seq-entry.seq"))) 
713     {
714                         goto erret;
715     }
716         }
717   else if (sep->choice == 2)
718         {
719     if (! MyBioseqSetAsnWrite((BioseqSetPtr)sep->data.ptrvalue, aip, AsnFind ("Seq-entry.set")))
720     {
721                         goto erret;
722     }
723         }
724   /* this is where we stop */
725         retval = TRUE;
726 erret:
727   AsnUnlinkType(orig);
728   return retval;
729 }
730 
731 
732 static Boolean MySeqSubmitAsnWrite (AsnIoPtr aip, SubmitBlockPtr sbp, SeqDescrPtr desc_list)
733 {
734         DataVal av;
735         AsnTypePtr atp;
736   Boolean retval = FALSE;
737         SeqEntryPtr sep;
738   SeqSubmitPtr ssp = NULL;
739   BioseqSetPtr bssp;
740   SeqDescrPtr  sdp, sdp_copy;
741 
742         if (aip == NULL)
743                 return FALSE;
744 
745         atp = AsnLinkType(NULL, AsnFind ("Seq-submit"));   /* link local tree */
746   if (atp == NULL)
747     return FALSE;
748 
749   ssp = SeqSubmitNew ();
750   ssp->sub = sbp;
751   ssp->datatype = 1;
752   sep = SeqEntryNew ();
753   sep->choice = 2;
754   bssp = BioseqSetNew ();
755   bssp->_class = BioseqseqSet_class_genbank;
756 
757   if (desc_list != NULL) {
758     for (sdp = desc_list; sdp != NULL; sdp = sdp->next) {
759       if (sdp->choice == Seq_descr_pub) {
760         sdp_copy = (SeqDescrPtr) AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescrAsnRead, (AsnWriteFunc) SeqDescrAsnWrite);
761         sdp_copy->next = bssp->descr;
762         bssp->descr = sdp_copy;
763       }
764     }
765   }
766 
767   sep->data.ptrvalue = bssp;
768   ssp->data = sep;
769 
770   if (! AsnOpenStruct(aip, atp, (Pointer)ssp))
771     goto erret;
772 
773         if (! SubmitBlockAsnWrite(ssp->sub, aip, AsnFind ("Seq-submit.sub"))) goto erret;
774 
775         av.ptrvalue = ssp->data;
776   if (! AsnWriteChoice(aip, AsnFind ("Seq-submit.data"), (Int2)ssp->datatype, &av)) goto erret;
777 
778         if (! AsnOpenStruct(aip, AsnFind ("Seq-submit.data.entrys"), ssp->data)) goto erret;
779         sep = (SeqEntryPtr) ssp->data;
780         if (! MySeqEntryAsnWrite(sep, aip, AsnFind ("Seq-submit.data.entrys.E"))) goto erret;
781   /* This is where we stop */
782   retval = TRUE;
783 erret:
784   ssp->sub = NULL;
785   ssp = SeqSubmitFree (ssp);
786         return retval;
787 }
788 
789 static void StartSeqSubmit (AsnIoPtr aip, SubmitBlockPtr sbp, SeqDescrPtr desc_list)
790 {
791 
792   if (aip == NULL || aip->fp == NULL) {
793     return;
794   }
795 
796   if (sbp == NULL) {
797     fprintf (aip->fp, "Seq-entry ::= set {\n");
798     fprintf (aip->fp, "class genbank ,\n");
799     fprintf (aip->fp, "seq-set {\n");
800   } else {
801     MySeqSubmitAsnWrite (aip, sbp, desc_list);
802     AsnIoFlush (aip);
803   }
804 }
805 
806 
807 static DenseSegPtr DenseSegFromConsensusReadAln (TConsensusReadAlnPtr aln, CharPtr contig_id, CharPtr read_id) 
808 {
809   DenseSegPtr dsp;
810   Int4        i;
811 
812   if (aln == NULL) {
813     return NULL;
814   }
815 
816   dsp = DenseSegNew ();
817   dsp->dim = 2;
818   dsp->numseg = aln->numseg;
819   dsp->ids = MakeSeqID (contig_id);
820   dsp->ids->next = MakeSeqID (read_id);
821   dsp->starts = (Int4Ptr) MemNew (sizeof (Int4) * dsp->dim * dsp->numseg);
822   dsp->lens = (Int4Ptr) MemNew (sizeof (Int4) * dsp->numseg);
823   if (aln->is_complement) {
824     dsp->strands = (Uint1Ptr) MemNew (sizeof (Uint1) * dsp->dim * dsp->numseg);
825     for (i = 0; i < dsp->numseg; i++) {
826       dsp->strands[i * 2] = Seq_strand_plus;
827       dsp->strands[(i * 2) + 1] = Seq_strand_minus;
828     }
829   }
830   for (i = 0; i < dsp->numseg; i++) {
831     dsp->starts[i * 2] = aln->cons_starts[i];
832     dsp->starts[(i * 2) + 1] = aln->read_starts[i];
833     dsp->lens [i] = aln->lens[i];
834   }
835   return dsp;
836 }
837 
838 
839 static SeqAlignPtr SeqAlignsForConsensusAndReads (TContigPtr contig)
840 {
841   SeqAlignPtr salp_list = NULL, salp_last = NULL, salp_tmp;
842   TConsensusReadAlnPtr aln;
843   DenseSegPtr dsp;
844   Int4 i;
845 
846   if (contig == NULL) {
847     return NULL;
848   }
849 
850   for (i = 0; i < contig->num_reads; i++) {
851     aln = GetConsensusReadAln (contig->consensus_seq, contig->reads[i]);
852     if (aln != NULL) {
853       dsp = DenseSegFromConsensusReadAln (aln, contig->consensus_id, contig->reads[i]->read_id);
854       if (dsp != NULL) {
855         salp_tmp = SeqAlignNew ();
856         salp_tmp->type = SAT_MASTERSLAVE;
857         salp_tmp->segtype = SAS_DENSEG;
858         salp_tmp->segs = dsp;
859         salp_tmp->dim = 2;
860         if (salp_list == NULL) {
861           salp_list = salp_tmp;
862         } else {
863           salp_last->next = salp_tmp;
864         }
865         salp_last = salp_tmp;
866       }
867     }
868   }
869   return salp_list;
870 }
871 
872 
873 static SeqEntryPtr MakeContigSeqEntryWithReads (TContigPtr contig)
874 {
875   BioseqSetPtr bssp;
876   SeqEntryPtr  sep, sep_prev;
877   Int4 i;
878   SeqAlignPtr salp;
879 
880   if (contig == NULL) {
881     return NULL;
882   }
883 
884   bssp = BioseqSetNew ();
885   bssp->_class = BioseqseqSet_class_genbank;
886   bssp->seq_set = MakeSeqEntryFromContig (contig);
887   salp = SeqAlignsForConsensusAndReads (contig);
888   if (salp != NULL) {
889     bssp->annot = SeqAnnotNew ();
890     bssp->annot->type = 2;
891     bssp->annot->data = salp;
892   }
893   sep_prev = bssp->seq_set;
894   for (i = 0; i < contig->num_reads; i++) {
895     sep = MakeSeqEntryFromRead (contig->reads[i]);
896     sep_prev->next = sep;
897     sep_prev = sep;
898   }
899   sep = SeqEntryNew ();
900   sep->choice = 2;
901   sep->data.ptrvalue = bssp;
902   return sep;
903 }
904 
905 
906 static void WriteXMLMsgUnableToOpenFile (CharPtr has_errors, CharPtr filename)
907 {
908   if (has_errors == NULL || filename == NULL) {
909     return;
910   }
911   if (*has_errors == 0) {
912     printf ("<aceread>\n");
913     *has_errors = 1;
914   }
915   printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Unable to open %s</message>\n", filename);
916 }
917 
918 
919 typedef struct contigcountcallback {
920   Int4 num_contigs;
921   Uint4 num_conbases;
922   Int4 num_reads;
923   Uint4 num_readbases;
924   Int4  file_num;
925 } ContigCountCallbackData, PNTR ContigCountCallbackPtr;
926 
927 
928 static ContigCountCallbackPtr ContigCountCallbackNew ()
929 {
930   ContigCountCallbackPtr c;
931 
932   c = (ContigCountCallbackPtr) MemNew (sizeof (ContigCountCallbackData));
933   MemSet (c, 0, sizeof (ContigCountCallbackData));
934   return c;
935 }
936 
937 
938 static ContigCountCallbackPtr SummarizeContigCountList (ValNodePtr list)
939 {
940   ContigCountCallbackPtr summ, c;
941   
942   summ = ContigCountCallbackNew();
943   while (list != NULL) {
944     c = (ContigCountCallbackPtr) list->data.ptrvalue;
945     if (c != NULL) {
946       summ->num_contigs += c->num_contigs;
947       summ->num_conbases += c->num_conbases;
948       summ->num_reads += c->num_reads;
949       summ->num_readbases += c->num_readbases;
950     }
951     list = list->next;
952   }
953   return summ;
954 }
955 
956 
957 typedef struct contigfilelist {
958   ValNodePtr list;
959   Int4 max_bases;
960   ContigCountCallbackPtr current;
961 } ContigFileListData, PNTR ContigFileListPtr;
962 
963 
964 #define ONE_CONTIG_FOR_FIRST
965 
966 static char ProcessContigCountCallback (TContigPtr contig, void *data)
967 {
968   ContigFileListPtr list;
969   Int4 i;
970 
971   list = (ContigFileListPtr) data;
972   if (contig == NULL || list == NULL) {
973     return 0;
974   }
975 
976   if (list->current == NULL || list->current->num_conbases > list->max_bases
977 #ifdef ONE_CONTIG_FOR_FIRST
978       || list->list->next == NULL
979 #endif
980     ) {
981     list->current = ContigCountCallbackNew();
982     list->current->file_num = ValNodeLen (list->list);
983     ValNodeAddPointer (&(list->list), 0, list->current);
984   }
985 
986   list->current->num_contigs++;
987   list->current->num_conbases += contig->consensus_seq_len;
988   list->current->num_reads += contig->num_reads;
989 
990   for (i = 0; i < contig->num_reads; i++) {
991     list->current->num_readbases += contig->reads[i]->read_len;
992   }
993   return 1;
994 }
995 
996 
/* How reads are named in the ACE file; see the 'n' entry of myargs for the
 * matching option letters. */
typedef enum {
  eReadNameType_local = 0,  /* L: local trace name */
  eReadNameType_TI,         /* T: TI number */
  eReadNameType_SRR         /* S: SRR ID */
} EReadNameType;
1001 
1002 static EReadNameType ReadNameTypeFromArg (CharPtr arg)
1003 {
1004   EReadNameType read_name_type = eReadNameType_local;
1005 
1006   if (arg != NULL) {
1007     if (StringNICmp (arg, "T", 1) == 0) {
1008       read_name_type = eReadNameType_TI;
1009     } else if (StringNICmp (arg, "S", 1) == 0) {
1010       read_name_type = eReadNameType_SRR;
1011     }
1012   }
1013   return read_name_type;
1014 }
1015 
1016 
1017 typedef struct contigcallback {
1018   AsnIoPtr asn1_out;
1019   AsnTypePtr atp;
1020   FILE *fasta_out;
1021   FILE *qual_out;
1022   FILE *xml_out;
1023 
1024   ValNodePtr file_counts_list;
1025   Int4 contig_count;
1026 
1027   CharPtr fasta_base;
1028   CharPtr asn_base;
1029   CharPtr xml_base;
1030   CharPtr qual_base;
1031 
1032   /* XML values */
1033   CharPtr subref;
1034   CharPtr center_name;
1035   Int4    taxid;
1036   CharPtr description;
1037   CharPtr assembly;
1038 
1039   Boolean recalculate_consensus;
1040   Boolean recalculate_only_Ns;
1041 
1042   Boolean no_lookup;
1043   Boolean is_srr;
1044   Boolean asn1_include_reads;
1045 
1046   EReadNameType read_name_type;
1047 
1048   SeqIdReplaceListPtr id_replacement_list;
1049 
1050   SubmitBlockPtr sbp;
1051   SeqDescrPtr desc_list;
1052 
1053   char *has_errors;
1054 } ContigCallbackData, PNTR ContigCallbackPtr;
1055 
1056 
1057 static AsnIoPtr StartAsnFile (CharPtr filename, SubmitBlockPtr sbp, SeqDescrPtr desc_list)
1058 {
1059   AsnIoPtr aip;
1060 
1061   aip = AsnIoOpen (filename, "w");
1062   if (aip != NULL) {
1063     aip->indent_level = 1;
1064     aip->first[aip->indent_level] = FALSE;
1065     StartSeqSubmit (aip, sbp, desc_list);
1066   }
1067   return aip;
1068 }
1069 
1070 
1071 static AsnIoPtr EndAsnFile (AsnIoPtr aip, Boolean is_submitblock)
1072 {
1073   if (aip != NULL) {
1074     AsnIoFlush (aip);
1075     if (is_submitblock) {
1076       fprintf (aip->fp, " } } } }\n");
1077     } else {
1078       fprintf (aip->fp, " } }\n");
1079     }
1080     AsnIoClose (aip);
1081     aip = NULL;
1082   }
1083   return aip;
1084 }
1085 
1086 
1087 static char ProcessContigCallback (TContigPtr contig, void *data)
1088 {
1089   ContigCallbackPtr c;
1090   SeqEntryPtr       sep;
1091   Char              filename[300];
1092   ContigCountCallbackPtr count = NULL;
1093   ValNodePtr             tmp;
1094   Boolean write_out = FALSE;
1095   Int4    i, ti;
1096   char rval = 0;
1097 
1098   c = (ContigCallbackPtr) data;
1099   if (contig == NULL || c == NULL) {
1100     return 0;
1101   }
1102 
1103   if (c->id_replacement_list != NULL) {
1104     UpdateContigIds (contig, c->id_replacement_list, c->no_lookup, c->is_srr, c->has_errors);
1105   }
1106 
1107   if (c->read_name_type == eReadNameType_TI) {
1108     for (i = 0; i < contig->num_reads; i++) {
1109       if (contig->reads[i]->read_id != NULL) {
1110         ti = atoi (contig->reads[i]->read_id);
1111         if (ti < 1) {
1112           if (*(c->has_errors) == 0) {
1113             printf ("<aceread>\n");
1114             *(c->has_errors) = 1;
1115           }
1116           printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">Non-integer value for ti</message>\n", contig->reads[i]->read_id);
1117         } else if (contig->reads[i]->ti == 0) {
1118           contig->reads[i]->ti = ti;
1119           free (contig->reads[i]->read_id);
1120           contig->reads[i]->read_id = NULL;
1121         } else if (ti == contig->reads[i]->ti) {
1122           free (contig->reads[i]->read_id);
1123           contig->reads[i]->read_id = NULL;
1124         } else {
1125           if (*(c->has_errors) == 0) {
1126             printf ("<aceread>\n");
1127             *(c->has_errors) = 1;
1128           }
1129           printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">Conflicting values for ti</message>\n", contig->reads[i]->read_id);
1130         }
1131       }
1132     }
1133   } else if (c->read_name_type == eReadNameType_SRR) {
1134     for (i = 0; i < contig->num_reads; i++) {
1135       if (contig->reads[i]->read_id != NULL) {
1136         if (contig->reads[i]->srr == NULL) {
1137           contig->reads[i]->srr = contig->reads[i]->read_id;
1138           contig->reads[i]->read_id = NULL;
1139         } else if (StringCmp (contig->reads[i]->read_id, contig->reads[i]->srr) == 0) {
1140           free (contig->reads[i]->read_id);
1141           contig->reads[i]->read_id = NULL;
1142         } else {
1143           if (*(c->has_errors) == 0) {
1144             printf ("<aceread>\n");
1145             *(c->has_errors) = 1;
1146           }
1147           printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">Conflicting values for srr</message>\n", contig->reads[i]->read_id);
1148         }
1149       }
1150     }
1151   }
1152 
1153   if (c->recalculate_consensus) {
1154     /* TODO - add read quality scores ? */
1155 
1156     if (ReplaceConsensusSequenceFromTraces (contig, c->recalculate_only_Ns) > 0) {
1157       write_out = TRUE;
1158     }
1159   } else {
1160     write_out = TRUE;
1161   }
1162 
1163   c->contig_count ++;
1164 
1165   if (write_out) {
1166     rval = 1;
1167     if (c->file_counts_list != NULL) {
1168       count = c->file_counts_list->data.ptrvalue;
1169     }
1170 
1171     /* write ASN.1 */
1172     if (c->asn1_out == NULL 
1173         && c->asn_base != NULL && count != NULL) {
1174       sprintf (filename, "%s.%d", c->asn_base, count->file_num);
1175       c->asn1_out = StartAsnFile (filename, c->sbp, c->desc_list);
1176       if (c->asn1_out == NULL) {
1177         WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1178         rval = 0;
1179       }
1180     }
1181     if (c->asn1_out != NULL) {
1182       if (c->asn1_include_reads) {
1183         sep = MakeContigSeqEntryWithReads (contig);
1184       } else {
1185         sep = MakeSeqEntryFromContig (contig);
1186       }
1187       if (c->desc_list != NULL) {
1188         VisitBioseqsInSep (sep, c->desc_list, AddDescrToNucBioseqCallback);
1189       }
1190       SeqEntryAsnWrite(sep, c->asn1_out, c->atp);
1191       sep = SeqEntryFree (sep);
1192       if (count != NULL && c->contig_count >= count->num_contigs) {
1193         c->asn1_out = EndAsnFile (c->asn1_out, c->sbp != NULL);
1194       }
1195     }
1196     
1197     /* write FASTA */
1198     if (c->fasta_out == NULL
1199         && c->fasta_base != NULL && count != NULL) {
1200       sprintf (filename, "%s.%d", c->fasta_base, count->file_num);
1201       c->fasta_out = FileOpen (filename, "w");
1202       if (c->fasta_out == NULL) {
1203         WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1204         rval = 0;
1205       }
1206     }
1207     if (c->fasta_out != NULL) {
1208       WriteFASTAFromContig (contig, c->fasta_out);
1209       if (count != NULL && c->contig_count >= count->num_contigs) {
1210         FileClose (c->fasta_out);
1211         c->fasta_out = NULL;
1212       }
1213     }
1214 
1215     /* write quality scores */
1216     if (c->qual_out == NULL
1217         && c->qual_base != NULL && count != NULL) {
1218       sprintf (filename, "%s.%d", c->qual_base, count->file_num);
1219       c->qual_out = FileOpen (filename, "w");
1220       if (c->qual_out == NULL) {
1221         WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1222         rval = 0;
1223       }
1224     }
1225     if (c->qual_out != NULL) {
1226       WriteContigQualScores (contig, c->qual_out);
1227       if (count != NULL && c->contig_count >= count->num_contigs) {
1228         FileClose (c->qual_out);
1229         c->qual_out = NULL;
1230       }
1231     }
1232 
1233     /* write XML */
1234     if (c->xml_out == NULL
1235         && c->xml_base != NULL && count != NULL) {
1236       sprintf (filename, "%s.%d", c->xml_base, count->file_num);
1237       c->xml_out = FileOpen (filename, "w");
1238       WriteTraceAssemblyHeader ("UPDATE", c->subref, c->center_name, c->taxid, c->description, c->assembly,
1239                                 count->num_contigs, count->num_conbases, count->num_reads, count->num_readbases,
1240                                 c->xml_out);
1241 
1242       if (c->xml_out == NULL) {
1243         WriteXMLMsgUnableToOpenFile (c->has_errors, filename);
1244         rval = 0;
1245       }
1246     }
1247     if (c->xml_out != NULL) {
1248       WriteTraceAssemblyFromContig (contig, c->xml_out);
1249       if (count != NULL && c->contig_count >= count->num_contigs) {
1250         WriteTraceAssemblyTrailer (c->xml_out);
1251         FileClose (c->xml_out);
1252         c->xml_out = NULL;
1253       }
1254     }
1255   }
1256 
1257   if (count != NULL && c->contig_count >= count->num_contigs) {
1258     tmp = c->file_counts_list;
1259     c->file_counts_list = tmp->next;
1260     tmp->next = NULL;
1261     tmp = ValNodeFreeData (tmp);
1262     c->contig_count = 0;
1263   }
1264 
1265   return 1;
1266 }
1267 
1268 
1269 static BioSourcePtr BioSourceDescriptorFromTaxId (Int4 taxid)
1270 {
1271   BioSourcePtr biop = NULL;
1272   DbtagPtr dbtag;
1273 
1274   if (taxid > 0) {
1275     biop = BioSourceNew ();
1276     biop->org = OrgRefNew();
1277     dbtag = DbtagNew ();
1278     dbtag->db = StringSave ("taxon");
1279     dbtag->tag = ObjectIdNew ();
1280     dbtag->tag->id = taxid;
1281     ValNodeAddPointer (&(biop->org->db), 0, dbtag);
1282   }
1283   return biop;
1284 }
1285 
1286 
1287 static void ReadLargeAceFile 
1288 (CharPtr acefile,
1289  CharPtr asn1_out,
1290  CharPtr fasta_out,
1291  CharPtr template_in,
1292  CharPtr qual_scores_out,
1293  CharPtr xml_out,
1294  CharPtr id_lookup,
1295  char *has_errors,
1296  Boolean recalculate_consensus,
1297  Boolean recalculate_only_Ns,
1298  CharPtr subref,
1299  CharPtr center_name,
1300  Int4    taxid,
1301  CharPtr description,
1302  CharPtr assembly,
1303  Boolean no_lookup,
1304  Boolean is_srr,
1305  Boolean make_qual_scores,
1306  Int4    chunk_size,
1307  EReadNameType read_name_type,
1308  Boolean include_reads)
1309 {
1310   ReadBufferData    rbd;
1311   ContigCallbackData c;
1312   SeqEntryPtr old_scope;
1313   FILE *f;
1314   SeqSubmitPtr   ssp = NULL;
1315   CitSubPtr      csp;
1316   Pointer         dataptr;
1317   Uint2           datatype;
1318   SeqDescrPtr     sdp, sdp_next;
1319   ContigFileListData file_count_list;
1320   ContigCountCallbackPtr summ;
1321   Boolean                has_source = FALSE;
1322 
1323   MemSet (&c, 0, sizeof (ContigCallbackData));
1324 
1325   c.no_lookup = no_lookup;
1326   c.is_srr = is_srr;
1327   c.has_errors = has_errors;
1328   c.asn1_include_reads = include_reads;
1329   c.read_name_type = read_name_type;
1330 
1331   /* filenames */
1332   c.asn_base = asn1_out;
1333   c.asn1_out = NULL;
1334   c.fasta_base = fasta_out;
1335   c.fasta_out = NULL;
1336   c.qual_base = qual_scores_out;
1337   c.qual_out = NULL;
1338   c.xml_base = xml_out;
1339   c.xml_out = NULL;
1340 
1341   /* XML values */
1342   c.subref = subref;
1343   c.center_name = center_name;
1344   c.taxid = taxid;
1345   c.description = description;
1346   c.assembly = assembly;
1347 
1348   file_count_list.list = NULL;
1349   file_count_list.current = NULL;
1350   file_count_list.max_bases = chunk_size;
1351 
1352   rbd.fp = OpenAceFile (acefile);
1353   if (rbd.fp == NULL) {
1354     WriteXMLMsgUnableToOpenFile (c.has_errors, acefile);
1355     goto escape;
1356   }
1357   rbd.current_data = NULL;
1358 
1359   ProcessLargeACEFileForContigFastaAndQualScores ( AbstractReadFunction, &rbd, 
1360                                                           qual_scores_out == NULL ? make_qual_scores : TRUE,
1361                                                           has_errors, ProcessContigCountCallback, &file_count_list);
1362 
1363   FileClose (rbd.fp);
1364   rbd.fp = NULL;
1365 
1366   /* prepare XML output */
1367   if (c.xml_base != NULL) {
1368     if (chunk_size < 1) {
1369       summ = SummarizeContigCountList (file_count_list.list);
1370       c.xml_out = FileOpen (c.xml_base, "w");
1371       if (c.xml_out == NULL) {
1372         WriteXMLMsgUnableToOpenFile (c.has_errors, c.xml_base);
1373         goto escape;
1374       }
1375       WriteTraceAssemblyHeader ("NEW", c.subref, c.center_name, c.taxid, c.description, c.assembly,
1376                                 summ->num_contigs, summ->num_conbases, summ->num_reads, summ->num_readbases,
1377                                 c.xml_out);
1378       summ = MemFree (summ);
1379       file_count_list.list = ValNodeFreeData (file_count_list.list);
1380     } else {
1381 #ifdef ONE_CONTIG_FOR_FIRST
1382       /* temporarily, start the first file instead, which will have just one contig */
1383       c.xml_out = FileOpen (c.xml_base, "w");
1384       if (c.xml_out == NULL) {
1385         WriteXMLMsgUnableToOpenFile (c.has_errors, c.xml_base);
1386         goto escape;
1387       }
1388       summ = (ContigCountCallbackPtr) file_count_list.list->data.ptrvalue;
1389       WriteTraceAssemblyHeader ("NEW", c.subref, c.center_name, c.taxid, c.description, c.assembly,
1390                                 summ->num_contigs, summ->num_conbases, summ->num_reads, summ->num_readbases,
1391                                 c.xml_out);
1392 #else
1393       f = FileOpen (c.xml_base, "w");
1394       if (f == NULL) {
1395         WriteXMLMsgUnableToOpenFile (c.has_errors, c.xml_base);
1396         goto escape;
1397       }
1398       WriteTraceAssemblyHeader ("NEW", c.subref, c.center_name, c.taxid, c.description, c.assembly,
1399                                 0, 0, 0, 0,
1400                                 f);
1401       WriteTraceAssemblyTrailer (f);
1402       FileClose (f);
1403 #endif
1404     }
1405   } else {
1406     if (chunk_size < 1) {
1407       file_count_list.list = ValNodeFreeData (file_count_list.list);
1408     }
1409   }
1410 
1411   c.file_counts_list = file_count_list.list;
1412 
1413   /* read template file */
1414   c.sbp = NULL;
1415   c.desc_list = NULL;
1416 
1417   if (!StringHasNoText (template_in)) {
1418     f = FileOpen (template_in, "r");
1419     if (f == NULL) {
1420       WriteXMLMsgUnableToOpenFile (c.has_errors, template_in);
1421       goto escape;
1422     }
1423     while ((dataptr = ReadAsnFastaOrFlatFile (f, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
1424       if (datatype == OBJ_SEQSUB) {
1425         ssp = (SeqSubmitPtr) dataptr;
1426         c.sbp = ssp->sub;
1427         ssp->sub = NULL;
1428         ssp = SeqSubmitFree (ssp);
1429       } else if (datatype == OBJ_SUBMIT_BLOCK) {
1430         c.sbp = (SubmitBlockPtr) dataptr;
1431       } else if (datatype == OBJ_SEQDESC) {
1432         sdp = (SeqDescrPtr) dataptr;
1433         if (sdp->choice == Seq_descr_source) {
1434           has_source = TRUE;
1435         }
1436         ValNodeLink (&(c.desc_list), (ValNodePtr) dataptr);
1437       } else {
1438         ObjMgrFree (datatype, dataptr);
1439       }
1440     }
1441     FileClose (f);
1442     if (c.sbp != NULL) {
1443       c.sbp->tool = MemFree (c.sbp->tool);
1444       c.sbp->tool = StringSave ("aceread");
1445       c.sbp->hup = FALSE;
1446       c.sbp->reldate = DateFree (c.sbp->reldate);
1447       csp = c.sbp->cit;
1448       if (csp != NULL) {
1449         csp->date = DateFree (csp->date);
1450         csp->date = DateCurr ();
1451       }
1452     }
1453   }
1454 
1455   if (taxid > 0 && !has_source) {
1456     /* tax lookup? */
1457     sdp = SeqDescrNew (NULL);
1458     sdp->choice = Seq_descr_source;
1459     sdp->data.ptrvalue = BioSourceDescriptorFromTaxId (taxid);
1460     ValNodeLink (&(c.desc_list), (ValNodePtr) sdp);
1461   }
1462 
1463   c.atp = AsnFind ("Bioseq-set.seq-set.E");
1464 
1465   c.recalculate_consensus = recalculate_consensus;
1466   c.recalculate_only_Ns = recalculate_only_Ns;
1467 
1468   if (id_lookup != NULL) {
1469     f = FileOpen (id_lookup, "r");
1470     if (f == NULL) {
1471       WriteXMLMsgUnableToOpenFile (c.has_errors, id_lookup);
1472       goto escape;
1473     }
1474     c.id_replacement_list = ReadSeqIdPairListFromFile (f);
1475     SeqEntrySetScope (old_scope);
1476     FileClose (f);
1477   }
1478 
1479   if (chunk_size < 1) {
1480     if (c.asn_base != NULL) {
1481       c.asn1_out = StartAsnFile (c.asn_base, c.sbp, c.desc_list);
1482       if (c.asn1_out == NULL) {
1483         WriteXMLMsgUnableToOpenFile (c.has_errors, c.asn_base);
1484         goto escape;
1485       }
1486     }
1487     if (c.fasta_base != NULL) {
1488       c.fasta_out = FileOpen (c.fasta_base, "w");
1489       if (c.fasta_out == NULL) {
1490         WriteXMLMsgUnableToOpenFile (c.has_errors, c.fasta_base);
1491         goto escape;
1492       }
1493     }
1494     if (c.qual_out != NULL) {
1495       c.qual_out = FileOpen (c.qual_base, "w");
1496       if (c.qual_out == NULL) {
1497         WriteXMLMsgUnableToOpenFile (c.has_errors, c.qual_base);
1498         goto escape;
1499       }
1500     }
1501   }
1502   
1503   rbd.fp = OpenAceFile (acefile);
1504   if (rbd.fp == NULL) {
1505     WriteXMLMsgUnableToOpenFile (c.has_errors, acefile);
1506     goto escape;
1507   }
1508   rbd.current_data = NULL;
1509 
1510   ProcessLargeACEFileForContigFastaAndQualScores ( AbstractReadFunction, &rbd, 
1511                                                           qual_scores_out == NULL ? FALSE : TRUE,
1512                                                           has_errors, ProcessContigCallback, &c);
1513 
1514 
1515 escape:
1516   FileClose (rbd.fp);
1517   c.id_replacement_list = SeqIdReplaceListFree (c.id_replacement_list);
1518   /* free c.desc_list */
1519   for (sdp = c.desc_list; sdp != NULL; sdp = sdp_next) {
1520     sdp_next = sdp->next;
1521     sdp->next = NULL;
1522     sdp = SeqDescrFree (sdp);
1523   }
1524   if (c.xml_out != NULL) {
1525     WriteTraceAssemblyTrailer (c.xml_out);
1526     FileClose (c.xml_out);
1527     c.xml_out = NULL;
1528   }
1529   if (c.asn1_out != NULL) {
1530     c.asn1_out = EndAsnFile (c.asn1_out, c.sbp != NULL);
1531   }
1532   if (c.fasta_out != NULL) {
1533     FileClose (c.fasta_out);
1534     c.fasta_out = NULL;
1535   }
1536   if (c.qual_out != NULL) {
1537     FileClose (c.qual_out);
1538     c.qual_out = NULL;
1539   }
1540   c.sbp = SubmitBlockFree (c.sbp);
1541 }
1542 
1543 
1544 Int2 Main (void)
1545 
1546 {
1547   CharPtr      infile, outfile, xmlfile;
1548 
1549   ReadBufferData    rbd;
1550   TACEFilePtr afp;
1551   Int4        i, len;
1552   SeqEntryPtr sep;
1553   AsnIoPtr    aip;
1554   FILE *f = NULL;
1555   FILE *f2;
1556   CharPtr app = "aceread_tst";
1557   BioseqSetPtr bssp;
1558   SeqEntryPtr  last_sep = NULL;
1559   Uint2        entityID;
1560   Boolean      make_qual_scores, suppress_lookup, srr_ids, fasta_out;
1561   CharPtr      submitter_ref = NULL, archive_id = NULL, description = NULL, assembly = NULL;
1562   CharPtr      center_name = NULL;
1563   CharPtr      format = NULL;
1564   CharPtr      gap_string;
1565   CharPtr      asn_file = NULL;
1566   Int4         limit = 0;
1567   char         has_errors = 0;
1568   Boolean      recalculate_consensus = FALSE, recalculate_only_Ns = FALSE;
1569   CharPtr      recalculate_options;
1570   SeqSubmitPtr ssp;
1571   CharPtr      id_substitution_file = NULL;
1572   Int4         taxon_id = 0;
1573 
1574   /* standard setup */
1575 
1576   ErrSetFatalLevel (SEV_MAX);
1577   ErrSetMessageLevel (SEV_MAX);
1578   ErrClearOptFlags (EO_SHOW_USERSTR);
1579   ErrSetLogfile ("stderr", ELOG_APPEND);
1580   ErrSetOpts (ERR_IGNORE, ERR_LOG_ON);
1581 
1582   UseLocalAsnloadDataAndErrMsg ();
1583   ErrPathReset ();
1584 
1585   if (! AllObjLoad ()) {
1586     Message (MSG_FATAL, "AllObjLoad failed");
1587     return 1;
1588   }
1589   if (! SubmitAsnLoad ()) {
1590     Message (MSG_FATAL, "SubmitAsnLoad failed");
1591     return 1;
1592   }
1593   if (! FeatDefSetLoad ()) {
1594     Message (MSG_FATAL, "FeatDefSetLoad failed");
1595     return 1;
1596   }
1597   PubSeqFetchEnable ();
1598 
1599   if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1600     return 0;
1601   }
1602 
1603   recalculate_options = (CharPtr) myargs[N_argRecalculateConsensus].strvalue;
1604   if (!StringHasNoText (recalculate_options)) {
1605     if (StringCmp (recalculate_options, "W") == 0) {
1606       recalculate_consensus = TRUE;
1607       recalculate_only_Ns = FALSE;
1608     } else if (StringCmp (recalculate_options, "N") == 0) {
1609       recalculate_consensus = TRUE;
1610       recalculate_only_Ns = TRUE;
1611     } else {
1612       Message (MSG_FATAL, "Invalid consensus sequence recalculation option");
1613       return 1;
1614     }
1615   }
1616 
1617 
1618   /* test gap info reading if provided */
1619   gap_string = (CharPtr) myargs[G_argGapString].strvalue;
1620   TestGapInfoReading (gap_string);
1621 
1622   /* limit number of contigs?  for debugging purposes */
1623   limit = myargs[l_argLimitNumContigs].intvalue;
1624 
1625   /* select format of input file */
1626   format = (CharPtr) myargs[F_argFormat].strvalue;
1627   if (StringHasNoText (format)) {
1628     format = "A";
1629   }
1630 
1631   infile = (CharPtr) myargs [i_argInputFile].strvalue;
1632   if (StringHasNoText (infile)) {
1633     Message (MSG_FATAL, "Must supply input file!");
1634     return 1;
1635   }
1636   outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
1637   xmlfile = (CharPtr) myargs[X_argXMLFile].strvalue;
1638   make_qual_scores = (Boolean) myargs [Q_argMakeQualScores].intvalue;
1639   center_name = (CharPtr) myargs[C_argCenter].strvalue;
1640   suppress_lookup = (Boolean) myargs [L_argSuppressIdLookup].intvalue;
1641   srr_ids = (Boolean) myargs[R_argSRRids].intvalue;
1642   fasta_out = (Boolean) myargs[f_argFASTA].intvalue;
1643 
1644   /* ASN.1 file to validate against */
1645   asn_file = (CharPtr) myargs [V_argValidateAgainstAsn1File].strvalue;
1646 
1647   if (!GetTSAFieldsFromString ((CharPtr) myargs [T_argTSAFields].strvalue,
1648                                &submitter_ref,
1649                                &archive_id,
1650                                &description,
1651                                &assembly,
1652                                &taxon_id)) {
1653     Message (MSG_FATAL, "Error reading TSA fields");
1654     return 1;
1655   }
1656 
1657   if (!StringHasNoText (xmlfile) && (StringHasNoText (center_name) || taxon_id < 1)) {
1658     PrintACEFormatErrorXML ("Must specify center name and taxid for XML output", NULL, &has_errors);
1659     printf ("</aceread>\n");
1660     return 1;
1661   }        
1662 
1663   len = StringLen (infile);
1664   if (StringHasNoText (outfile)) {
1665     if (len > 3 && StringCmp (infile + len - 4, ".ace") == 0) {
1666       outfile = StringSave (infile);
1667       StringCpy (outfile + len - 3, "sqn");
1668     } else if (len > 6 && StringCmp (infile + len - 7, ".ace.gz") == 0) {
1669       outfile = StringSave (infile);
1670       StringCpy (outfile + len - 6, "sqn");
1671     } else {
1672       outfile = (CharPtr) MemNew (sizeof (Char) * (len + 5));
1673       sprintf (outfile, "%s.sqn", infile);
1674     }
1675   }
1676 
1677   if (!StringHasNoText ((CharPtr) myargs [S_argIDSubstitutionFile].strvalue)) {
1678     id_substitution_file = ((CharPtr) myargs [S_argIDSubstitutionFile].strvalue);
1679   }
1680 
1681   if (StringChr (format, 'A') != NULL) {
1682     ReadLargeAceFile (infile, fasta_out ? NULL : outfile,
1683                       fasta_out ? outfile : NULL, 
1684                       (CharPtr) myargs[t_argTemplateFile].strvalue,
1685                       NULL, xmlfile, id_substitution_file, &has_errors,
1686                       recalculate_consensus, recalculate_only_Ns,
1687                       submitter_ref, center_name, taxon_id, description, assembly, 
1688                       suppress_lookup, srr_ids, make_qual_scores,
1689                       myargs[c_argChunkSize].intvalue,
1690                       ReadNameTypeFromArg (myargs[n_argReadNameType].strvalue),
1691                       (Boolean) myargs [z_argIncludeReads].intvalue);
1692     if (has_errors) {
1693       printf ("</aceread>\n");
1694       return 1;
1695     } else {
1696       return 0;
1697     }
1698   }
1699 
1700   if (id_substitution_file != NULL) {
1701     f = FileOpen (id_substitution_file, "r");
1702     if (f == NULL) {
1703       Message (MSG_FATAL, "Unable to open %s", id_substitution_file);
1704       return 1;
1705     }
1706   }
1707 
1708   if (StringChr (format, 'M') != NULL) {
1709     rbd.fp = FileOpen (infile, "r");
1710     if (rbd.fp == NULL) {
1711       Message (MSG_FATAL, "Unable to open %s", infile);
1712       return 1;
1713     }
1714 
1715     rbd.current_data = NULL;
1716     afp = ReadMAQFile (AbstractReadFunction, &rbd);
1717   } else if (StringChr (format, 'E') != NULL) {
1718     rbd.fp = FileOpen (infile, "r");
1719     if (rbd.fp == NULL) {
1720       Message (MSG_FATAL, "Unable to open %s", infile);
1721       return 1;
1722     }
1723 
1724     rbd.current_data = NULL;
1725     afp = ReadElandStandaloneFile (AbstractReadFunction, &rbd);
1726   } else if (StringChr (format, 'A') != NULL) { 
1727     rbd.fp = OpenAceFile (infile);
1728     if (rbd.fp == NULL) {
1729       Message (MSG_FATAL, "Unable to open %s", infile);
1730       return 1;
1731     }
1732     rbd.current_data = NULL;
1733     afp = ReadACEFile ( AbstractReadFunction, &rbd, make_qual_scores, &has_errors);
1734   } else {
1735     Message (MSG_FATAL, "Unrecognized format: %s\n", format);
1736     return 1;
1737   }
1738   FileClose (rbd.fp);
1739   if (afp == NULL) {
1740     printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Unable to read file</message>\n");
1741   } else {
1742     if (recalculate_consensus) {
1743         if (!AddReadQualityScores (afp, (CharPtr) myargs [q_argReadQualScoresFile].strvalue, (CharPtr) myargs [r_argReadFASTAFile].strvalue)) {
1744             printf ("<message severity=\"ERROR\" seq-id=\"No ID\" code=\"bad_format\">Failed to add read quality scores</message>\n");
1745         } else {
1746             RecalculateConsensusSequences (afp, recalculate_only_Ns);
1747         }
1748     }
1749 
1750     if (limit > 0) {
1751       for (i = limit; i < afp->num_contigs; i++) {
1752         ContigFree (afp->contigs[i]);
1753         afp->contigs[i] = NULL;
1754       }
1755       afp->num_contigs = limit;
1756     }
1757 
1758     if (f != NULL) {
1759       UpdateAceFileIds (afp, f, suppress_lookup, srr_ids, &has_errors);
1760       FileClose (f);
1761       f = NULL;
1762     }
1763     ValidateAceFileIds (afp, &has_errors);
1764 
1765     if (asn_file != NULL) {
1766       if (ValidateAgainstASNFile (afp, asn_file, &has_errors)) {
1767         printf ("Validation against %s succeeded\n", asn_file);
1768       }
1769     }
1770 
1771     if (!StringHasNoText (xmlfile)) {
1772       f2 = FileOpen (xmlfile, "w");
1773       WriteTraceAssemblyFromAceFile (afp, submitter_ref, center_name, 0, description, f2);
1774       FileClose (f2);
1775     }
1776 
1777     if (fasta_out) {
1778       f2 = FileOpen (outfile, "w");
1779       WriteFASTAFromAceFile (afp, f2);
1780       FileClose (f2);
1781     } else {
1782       aip = AsnIoOpen (outfile, "w");
1783       if (aip == NULL) {
1784         printf ("Unable to open %s\n", outfile);
1785       } else {
1786         bssp = BioseqSetNew ();
1787         bssp->_class = BioseqseqSet_class_genbank;
1788 
1789         for (i = 0; i < afp->num_contigs; i++) {
1790           sep = MakeSeqEntryFromContig (afp->contigs[i]);
1791           if (last_sep == NULL) {
1792             bssp->seq_set = sep;
1793           } else {
1794             last_sep->next = sep;
1795           }
1796           last_sep = sep;          
1797         }
1798         sep = ValNodeNew (NULL);
1799         sep->choice = 2;
1800         sep->data.ptrvalue = bssp;
1801         bssp->seqentry = sep;
1802         SeqMgrLinkSeqEntry (sep, 0, NULL);
1803         entityID = ObjMgrGetEntityIDForChoice (sep);
1804         AssignIDsInEntityEx (entityID, 0, NULL, NULL);
1805         SeqMgrIndexFeatures (entityID, sep);
1806         ssp = AddSeqSubmitFromTemplate (sep, (CharPtr) myargs[t_argTemplateFile].strvalue);
1807         if (ssp == NULL) {
1808           SeqEntryAsnWrite (sep, aip, NULL);
1809           sep = SeqEntryFree (sep);
1810         } else {
1811           SeqSubmitAsnWrite (ssp, aip, NULL);
1812           ssp = SeqSubmitFree (ssp);
1813         }
1814         AsnIoClose (aip);
1815       }
1816     }
1817   }
1818 
1819   if (has_errors) {
1820     printf ("</aceread>\n");
1821   }
1822 
1823   return 0;
1824 
1825 }
1826 
1827 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.