NCBI C Toolkit Cross Reference

C/demo/bulk2htgs.c


  1 /*   bulk2htgs.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material.
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  bulk2htgs.c
 27 *
 28 * Author:  Jonathan Kans
 29 *
 30 * Version Creation Date:   11/2/99
 31 *
 32 * $Revision: 6.6 $
 33 *
 34 * File Description: 
 35 *
 36 * Modifications:  
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45 #include <ncbi.h>
 46 #include <objall.h>
 47 #include <objsset.h>
 48 #include <objsub.h>
 49 #include <sequtil.h>
 50 #include <sqnutils.h>
 51 #include <subutil.h>
 52 
 53 static void AddBioSourceToBioseq (BioseqPtr bsp, CharPtr organism, BioSourcePtr bio)
 54 
 55 {
 56   BioSourcePtr  biop;
 57   OrgRefPtr     orp;
 58   SeqDescrPtr   sdp;
 59 
 60   if (bsp == NULL) return;
 61 
 62   if (bio != NULL) {
 63     biop = AsnIoMemCopy ((Pointer) bio,
 64                          (AsnReadFunc) BioSourceAsnRead,
 65                          (AsnWriteFunc) BioSourceAsnWrite);
 66   } else {
 67     if (StringHasNoText (organism)) return;
 68     biop = BioSourceNew ();
 69     if (biop == NULL) return;
 70     orp = OrgRefNew ();
 71     if (orp == NULL) return;
 72     biop->org = orp;
 73     orp->taxname = StringSave (organism);
 74   }
 75 
 76   sdp = SeqDescrAdd (&(bsp->descr));
 77   if (sdp == NULL) return;
 78   sdp->choice = Seq_descr_source;
 79   sdp->data.ptrvalue = (Pointer) biop;
 80 }
 81 
 82 static void AddMolInfoToBioseq (BioseqPtr bsp, Boolean is_mrna, Int2 htgs_phase)
 83 
 84 {
 85   MolInfoPtr   mip;
 86   SeqDescrPtr  sdp;
 87 
 88   if (bsp == NULL) return;
 89 
 90   mip = MolInfoNew ();
 91   if (mip == NULL) return;
 92   if (is_mrna) {
 93     mip->biomol = MOLECULE_TYPE_MRNA;
 94   } else {
 95     mip->biomol = MOLECULE_TYPE_GENOMIC;
 96   }
 97   switch (htgs_phase) {
 98     case 0 :
 99       mip->tech = MI_TECH_htgs_0;
100       break;
101     case 1 :
102       mip->tech = MI_TECH_htgs_1;
103       break;
104     case 2 :
105       mip->tech = MI_TECH_htgs_2;
106       break;
107     case 3 :
108       mip->tech = MI_TECH_htgs_3;
109       break;
110     default :
111       break;
112   }
113 
114   sdp = SeqDescrAdd (&(bsp->descr));
115   if (sdp == NULL) return;
116   sdp->choice = Seq_descr_molinfo;
117   sdp->data.ptrvalue = (Pointer) mip;
118 }
119 
120 static void ConvertSeqID (BioseqPtr bsp, CharPtr general,
121                           Boolean parse_colon, Boolean id_comment)
122 
123 {
124   Char         ch;
125   CharPtr      db, id, ptr;
126   DbtagPtr     dbt;
127   Char         idcom [128], tmp [128];
128   Boolean      justdigits;
129   ObjectIdPtr  oip;
130   SeqDescrPtr  sdp;
131   SeqIdPtr     sip = NULL;
132   long int     val;
133 
134   if (bsp == NULL) return;
135 
136   for (sip = bsp->id;
137        sip != NULL && sip->choice != SEQID_LOCAL;
138        sip = sip->next) continue;
139   if (sip == NULL) return;
140 
141   oip = (ObjectIdPtr) sip->data.ptrvalue;
142   if (oip == NULL) return;
143   if (oip->str != NULL) {
144     StringNCpy_0 (tmp, oip->str, sizeof (tmp));
145   } else {
146     sprintf (tmp, "%ld", (long) oip->id);
147   }
148 
149   /* if colon in localid, parse db and id separately */
150 
151   ptr = StringChr (tmp, ':');
152   if (parse_colon && ptr != NULL) {
153     db = tmp;
154     *ptr = '\0';
155     ptr++;
156     id = ptr;
157   } else {
158     db = NULL;
159     id = tmp;
160   }
161 
162   /* ignore db in localid if general tag passed in */
163 
164   if (! StringHasNoText (general)) {
165     db = general;
166   }
167 
168   if (StringHasNoText (db) || StringHasNoText (id)) return;
169   dbt = DbtagNew ();
170   if (dbt == NULL) return;
171 
172   /* insert dbtag between seqid and objectid, change choice and objectid */
173 
174   sip->choice = SEQID_GENERAL;
175   sip->data.ptrvalue = (Pointer) dbt;
176   dbt->db = StringSave (db);
177   dbt->tag = oip;
178   oip->str = MemFree (oip->str);
179 
180   for (justdigits = TRUE, ptr = id, ch = *ptr;
181        ch != '\0';
182        ptr++, ch = *ptr) {
183     if (ch == ' ' || ch == '+' || ch == '-') {
184     } else if (! IS_DIGIT (ch)) {
185       justdigits = FALSE;
186     }
187   }
188 
189   if (justdigits && sscanf (id, "%ld", &val) == 1) {
190     oip->id = (Int4) val;
191   } else {
192     oip->str = StringSave (id);
193   }
194 
195   if (id_comment) {
196     if (oip->str != NULL) {
197       StringNCpy_0 (tmp, oip->str, sizeof (tmp));
198     } else {
199       sprintf (tmp, "%ld", oip->id);
200     }
201     sprintf (idcom, "This sequence was identified as %s by the submitter", tmp);
202     sdp = SeqDescrAdd (&(bsp->descr));
203     if (sdp != NULL) {
204       sdp->choice = Seq_descr_comment;
205       sdp->data.ptrvalue = (Pointer) StringSave (idcom);
206     }
207   }
208 
209 
210   SeqMgrReplaceInBioseqIndex (bsp);
211 }
212 
213 static void ProcessOneRecord (SeqSubmitPtr ssp, CharPtr organism,
214                               BioSourcePtr biop, CharPtr general,
215                               FILE* ofp, Boolean is_mrna,
216                               Int2 htgs_phase, Boolean parse_colon,
217                               Boolean id_comment, CharPtr comment,
218                               Uint2 datatype, Pointer dataptr)
219 
220 {
221   AsnIoPtr     aip;
222   BioseqPtr    bsp;
223   Int4         pos;
224   SeqDescrPtr  sdp;
225   SeqEntryPtr  sep;
226 
227   if (ssp == NULL || ofp == NULL) return;
228   if (organism == NULL && biop == NULL) return;
229   if (datatype != OBJ_BIOSEQ) return;
230 
231   bsp = (BioseqPtr) dataptr;
232   if (bsp == NULL) return;
233   sep = SeqMgrGetSeqEntryForData (bsp);
234   if (sep == NULL) return;
235 
236   AddBioSourceToBioseq (bsp, organism, biop);
237   AddMolInfoToBioseq (bsp, is_mrna, htgs_phase);
238   if (is_mrna) {
239     bsp->mol = Seq_mol_rna;
240   } else {
241     bsp->mol = Seq_mol_dna;
242   }
243   ConvertSeqID (bsp, general, parse_colon, id_comment);
244   if (! StringHasNoText (comment)) {
245     sdp = SeqDescrAdd (&(bsp->descr));
246     if (sdp != NULL) {
247       sdp->choice = Seq_descr_comment;
248       sdp->data.ptrvalue = (Pointer) StringSave (comment);
249     }
250   }
251   sdp = SeqDescrAdd (&(bsp->descr));
252   if (sdp != NULL) {
253     sdp->choice = Seq_descr_create_date;
254     sdp->data.ptrvalue = (Pointer) DateCurr ();
255   }
256 
257   ssp->data = sep;
258   ssp->datatype = 1;
259 
260   aip = AsnIoNew (ASNIO_TEXT_OUT, ofp, NULL, NULL, NULL);
261 
262   SeqSubmitAsnWrite (ssp, aip, NULL);
263 
264   pos = AsnIoTell (aip);
265   AsnIoFree (aip, FALSE);
266   fseek (ofp, pos, SEEK_SET);
267   fprintf (ofp, "\n");
268 
269   ssp->data = NULL;
270   SeqEntryFree (sep);
271 }
272 
273 static BioSourcePtr ReadBioSource (CharPtr path)
274 
275 {
276   AsnIoPtr      aip;
277   BioSourcePtr  biop = NULL;
278 
279   aip = AsnIoOpen (path, "r");
280   if (aip == NULL) return NULL;
281 
282   biop = BioSourceAsnRead (aip, NULL);
283 
284   AsnIoClose (aip);
285 
286   return biop;
287 }
288 
289 /* template file can contain either Seq-submit or Submit-block */
290 
291 static SeqSubmitPtr ReadSubmitBlock (CharPtr path)
292 
293 {
294   CitSubPtr       csp;
295   Pointer         dataptr;
296   Uint2           datatype;
297   FILE            *fp;
298   SubmitBlockPtr  sbp = NULL;
299   SeqSubmitPtr    ssp = NULL;
300 
301   fp = FileOpen (path, "r");
302   if (fp == NULL) return NULL;
303 
304   dataptr = ReadAsnFastaOrFlatFile (fp, &datatype,  NULL, FALSE,
305                                     FALSE, TRUE, FALSE);
306   FileClose (fp);
307 
308   switch (datatype) {
309     case OBJ_SUBMIT_BLOCK :
310       sbp = (SubmitBlockPtr) dataptr;
311       ssp = SeqSubmitNew ();
312       if (ssp != NULL) {
313         ssp->sub = sbp;
314       }
315       break;
316     case OBJ_SEQSUB :
317       ssp = (SeqSubmitPtr) dataptr;
318       if (ssp != NULL) {
319         sbp = ssp->sub;
320       }
321       break;
322     default :
323       break;
324   }
325 
326   if (sbp != NULL) {
327     csp = sbp->cit;
328     if (csp != NULL) {
329       csp->date = DateFree (csp->date);
330       csp->date = DateCurr ();
331     }
332   }
333 
334   return ssp;
335 }
336 
337 Args myargs [] = {
338   {"Filename for FASTA input", "stdin", NULL, NULL,
339     FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
340   {"Filename for Seq-submit template", NULL, NULL, NULL,
341     FALSE, 't', ARG_FILE_IN, 0.0, 0, NULL},
342   {"Filename for ASN.1 output", "stdout", NULL, NULL,
343     FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
344   {"Organism name", NULL, NULL, NULL,
345     TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
346   {"General ID tag", NULL, NULL, NULL,
347     TRUE, 'g', ARG_STRING, 0.0, 0, NULL},
348   {"Sequences are mRNA?", "F", NULL, NULL,
349     TRUE, 'm', ARG_BOOLEAN, 0.0, 0, NULL},
350   {"HTGS phase?", "1", "0" ,"3",
351     FALSE, 'p', ARG_INT, 0.0, 0, NULL},
352   {"Parse colon in tag", "F", NULL, NULL,
353     TRUE, 'd', ARG_BOOLEAN, 0.0, 0, NULL},
354   {"Comment", NULL, NULL, NULL,
355     TRUE, 'c', ARG_STRING, 0.0, 0, NULL},
356   {"Filename for BioSource", NULL, NULL, NULL,
357     TRUE, 'b', ARG_FILE_IN, 0.0, 0, NULL},
358   {"Make identifier comment", "F", NULL, NULL,
359     TRUE, 'f', ARG_BOOLEAN, 0.0, 0, NULL},
360 };
361 
362 Int2 Main (void)
363 
364 {
365   BioSourcePtr  biop = NULL;
366   Pointer       dataptr;
367   Uint2         datatype;
368   CharPtr       fasta_fname, template_fname, output_fname,
369                 organism, generalid, comment, biosource_fname;
370   Int2          htgs_phase;
371   FILE          *ifp, *ofp;
372   Boolean       id_comment, is_mrna, parse_colon;
373   SeqSubmitPtr  ssp;
374 
375   ErrSetFatalLevel (SEV_FATAL);
376   ErrClearOptFlags (EO_SHOW_USERSTR);
377   UseLocalAsnloadDataAndErrMsg ();
378   ErrPathReset ();
379 
380   if (! AllObjLoad ()) {
381     Message (MSG_FATAL, "AllObjLoad failed");
382     return 1;
383   }
384   if (! SubmitAsnLoad ()) {
385     Message (MSG_FATAL, "SubmitAsnLoad failed");
386     return 1;
387   }
388   if (! SeqCodeSetLoad ()) {
389     Message (MSG_FATAL, "SeqCodeSetLoad failed");
390     return 1;
391   }
392   if (! GeneticCodeTableLoad ()) {
393     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
394     return 1;
395   }
396 
397   if (! GetArgs ("bulk2htgs", sizeof (myargs) / sizeof (Args), myargs)) {
398     return 0;
399   }
400 
401   fasta_fname = myargs [0].strvalue;
402   template_fname = myargs [1].strvalue;
403   output_fname = myargs [2].strvalue;
404   organism = myargs [3].strvalue;
405   generalid = myargs [4].strvalue;
406   is_mrna = (Boolean) myargs [5].intvalue;
407   htgs_phase = (Int2) myargs [6].intvalue;
408   parse_colon = (Boolean) myargs [7].intvalue;
409   comment = myargs [8].strvalue;
410   biosource_fname = myargs [9].strvalue;
411   id_comment = (Boolean) myargs [10].intvalue;
412 
413   if (StringHasNoText (output_fname)) {
414     Message (MSG_FATAL, "Unable to open output file");
415     return 1;
416   }
417 
418   ssp = ReadSubmitBlock (template_fname);
419   if (ssp == NULL) {
420     Message (MSG_FATAL, "Unable to read template file");
421     return 1;
422   }
423   ssp->datatype = 1;
424 
425   if (! StringHasNoText (biosource_fname)) {
426     biop = ReadBioSource (biosource_fname);
427     if (biop == NULL) {
428       Message (MSG_FATAL, "Unable to read BioSource file");
429       return 1;
430     }
431   }
432   if (biop == NULL && StringHasNoText (organism)) {
433     Message (MSG_FATAL, "Organism name or BioSource file is required for processing");
434     return 1;
435   }
436 
437 
438   ifp = FileOpen (fasta_fname, "r");
439   if (ifp == NULL) {
440     Message (MSG_FATAL, "Unable to open input file");
441     return 1;
442   }
443 
444   ofp = FileOpen (output_fname, "w");
445   if (ofp == NULL) {
446     Message (MSG_FATAL, "Unable to create output file");
447     return 1;
448   }
449 
450   while ((dataptr = ReadAsnFastaOrFlatFile (ifp, &datatype, NULL, FALSE,
451                                             FALSE, TRUE, FALSE)) != NULL) {
452     ProcessOneRecord (ssp, organism, biop, generalid, ofp,
453                       is_mrna, htgs_phase, parse_colon, id_comment,
454                       comment, datatype, dataptr);
455   }
456 
457   FileClose (ofp);
458   FileClose (ifp);
459 
460   BioSourceFree (biop);
461   SeqSubmitFree (ssp);
462 
463   return 0;
464 }
465 
466 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.