|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/demo/bulk2htgs.c |
source navigation diff markup identifier search freetext search file search |
1 /* bulk2htgs.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: bulk2htgs.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 11/2/99
31 *
32 * $Revision: 6.6 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <sequtil.h>
50 #include <sqnutils.h>
51 #include <subutil.h>
52
53 static void AddBioSourceToBioseq (BioseqPtr bsp, CharPtr organism, BioSourcePtr bio)
54
55 {
56 BioSourcePtr biop;
57 OrgRefPtr orp;
58 SeqDescrPtr sdp;
59
60 if (bsp == NULL) return;
61
62 if (bio != NULL) {
63 biop = AsnIoMemCopy ((Pointer) bio,
64 (AsnReadFunc) BioSourceAsnRead,
65 (AsnWriteFunc) BioSourceAsnWrite);
66 } else {
67 if (StringHasNoText (organism)) return;
68 biop = BioSourceNew ();
69 if (biop == NULL) return;
70 orp = OrgRefNew ();
71 if (orp == NULL) return;
72 biop->org = orp;
73 orp->taxname = StringSave (organism);
74 }
75
76 sdp = SeqDescrAdd (&(bsp->descr));
77 if (sdp == NULL) return;
78 sdp->choice = Seq_descr_source;
79 sdp->data.ptrvalue = (Pointer) biop;
80 }
81
82 static void AddMolInfoToBioseq (BioseqPtr bsp, Boolean is_mrna, Int2 htgs_phase)
83
84 {
85 MolInfoPtr mip;
86 SeqDescrPtr sdp;
87
88 if (bsp == NULL) return;
89
90 mip = MolInfoNew ();
91 if (mip == NULL) return;
92 if (is_mrna) {
93 mip->biomol = MOLECULE_TYPE_MRNA;
94 } else {
95 mip->biomol = MOLECULE_TYPE_GENOMIC;
96 }
97 switch (htgs_phase) {
98 case 0 :
99 mip->tech = MI_TECH_htgs_0;
100 break;
101 case 1 :
102 mip->tech = MI_TECH_htgs_1;
103 break;
104 case 2 :
105 mip->tech = MI_TECH_htgs_2;
106 break;
107 case 3 :
108 mip->tech = MI_TECH_htgs_3;
109 break;
110 default :
111 break;
112 }
113
114 sdp = SeqDescrAdd (&(bsp->descr));
115 if (sdp == NULL) return;
116 sdp->choice = Seq_descr_molinfo;
117 sdp->data.ptrvalue = (Pointer) mip;
118 }
119
120 static void ConvertSeqID (BioseqPtr bsp, CharPtr general,
121 Boolean parse_colon, Boolean id_comment)
122
123 {
124 Char ch;
125 CharPtr db, id, ptr;
126 DbtagPtr dbt;
127 Char idcom [128], tmp [128];
128 Boolean justdigits;
129 ObjectIdPtr oip;
130 SeqDescrPtr sdp;
131 SeqIdPtr sip = NULL;
132 long int val;
133
134 if (bsp == NULL) return;
135
136 for (sip = bsp->id;
137 sip != NULL && sip->choice != SEQID_LOCAL;
138 sip = sip->next) continue;
139 if (sip == NULL) return;
140
141 oip = (ObjectIdPtr) sip->data.ptrvalue;
142 if (oip == NULL) return;
143 if (oip->str != NULL) {
144 StringNCpy_0 (tmp, oip->str, sizeof (tmp));
145 } else {
146 sprintf (tmp, "%ld", (long) oip->id);
147 }
148
149 /* if colon in localid, parse db and id separately */
150
151 ptr = StringChr (tmp, ':');
152 if (parse_colon && ptr != NULL) {
153 db = tmp;
154 *ptr = '\0';
155 ptr++;
156 id = ptr;
157 } else {
158 db = NULL;
159 id = tmp;
160 }
161
162 /* ignore db in localid if general tag passed in */
163
164 if (! StringHasNoText (general)) {
165 db = general;
166 }
167
168 if (StringHasNoText (db) || StringHasNoText (id)) return;
169 dbt = DbtagNew ();
170 if (dbt == NULL) return;
171
172 /* insert dbtag between seqid and objectid, change choice and objectid */
173
174 sip->choice = SEQID_GENERAL;
175 sip->data.ptrvalue = (Pointer) dbt;
176 dbt->db = StringSave (db);
177 dbt->tag = oip;
178 oip->str = MemFree (oip->str);
179
180 for (justdigits = TRUE, ptr = id, ch = *ptr;
181 ch != '\0';
182 ptr++, ch = *ptr) {
183 if (ch == ' ' || ch == '+' || ch == '-') {
184 } else if (! IS_DIGIT (ch)) {
185 justdigits = FALSE;
186 }
187 }
188
189 if (justdigits && sscanf (id, "%ld", &val) == 1) {
190 oip->id = (Int4) val;
191 } else {
192 oip->str = StringSave (id);
193 }
194
195 if (id_comment) {
196 if (oip->str != NULL) {
197 StringNCpy_0 (tmp, oip->str, sizeof (tmp));
198 } else {
199 sprintf (tmp, "%ld", oip->id);
200 }
201 sprintf (idcom, "This sequence was identified as %s by the submitter", tmp);
202 sdp = SeqDescrAdd (&(bsp->descr));
203 if (sdp != NULL) {
204 sdp->choice = Seq_descr_comment;
205 sdp->data.ptrvalue = (Pointer) StringSave (idcom);
206 }
207 }
208
209
210 SeqMgrReplaceInBioseqIndex (bsp);
211 }
212
213 static void ProcessOneRecord (SeqSubmitPtr ssp, CharPtr organism,
214 BioSourcePtr biop, CharPtr general,
215 FILE* ofp, Boolean is_mrna,
216 Int2 htgs_phase, Boolean parse_colon,
217 Boolean id_comment, CharPtr comment,
218 Uint2 datatype, Pointer dataptr)
219
220 {
221 AsnIoPtr aip;
222 BioseqPtr bsp;
223 Int4 pos;
224 SeqDescrPtr sdp;
225 SeqEntryPtr sep;
226
227 if (ssp == NULL || ofp == NULL) return;
228 if (organism == NULL && biop == NULL) return;
229 if (datatype != OBJ_BIOSEQ) return;
230
231 bsp = (BioseqPtr) dataptr;
232 if (bsp == NULL) return;
233 sep = SeqMgrGetSeqEntryForData (bsp);
234 if (sep == NULL) return;
235
236 AddBioSourceToBioseq (bsp, organism, biop);
237 AddMolInfoToBioseq (bsp, is_mrna, htgs_phase);
238 if (is_mrna) {
239 bsp->mol = Seq_mol_rna;
240 } else {
241 bsp->mol = Seq_mol_dna;
242 }
243 ConvertSeqID (bsp, general, parse_colon, id_comment);
244 if (! StringHasNoText (comment)) {
245 sdp = SeqDescrAdd (&(bsp->descr));
246 if (sdp != NULL) {
247 sdp->choice = Seq_descr_comment;
248 sdp->data.ptrvalue = (Pointer) StringSave (comment);
249 }
250 }
251 sdp = SeqDescrAdd (&(bsp->descr));
252 if (sdp != NULL) {
253 sdp->choice = Seq_descr_create_date;
254 sdp->data.ptrvalue = (Pointer) DateCurr ();
255 }
256
257 ssp->data = sep;
258 ssp->datatype = 1;
259
260 aip = AsnIoNew (ASNIO_TEXT_OUT, ofp, NULL, NULL, NULL);
261
262 SeqSubmitAsnWrite (ssp, aip, NULL);
263
264 pos = AsnIoTell (aip);
265 AsnIoFree (aip, FALSE);
266 fseek (ofp, pos, SEEK_SET);
267 fprintf (ofp, "\n");
268
269 ssp->data = NULL;
270 SeqEntryFree (sep);
271 }
272
273 static BioSourcePtr ReadBioSource (CharPtr path)
274
275 {
276 AsnIoPtr aip;
277 BioSourcePtr biop = NULL;
278
279 aip = AsnIoOpen (path, "r");
280 if (aip == NULL) return NULL;
281
282 biop = BioSourceAsnRead (aip, NULL);
283
284 AsnIoClose (aip);
285
286 return biop;
287 }
288
289 /* template file can contain either Seq-submit or Submit-block */
290
291 static SeqSubmitPtr ReadSubmitBlock (CharPtr path)
292
293 {
294 CitSubPtr csp;
295 Pointer dataptr;
296 Uint2 datatype;
297 FILE *fp;
298 SubmitBlockPtr sbp = NULL;
299 SeqSubmitPtr ssp = NULL;
300
301 fp = FileOpen (path, "r");
302 if (fp == NULL) return NULL;
303
304 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE,
305 FALSE, TRUE, FALSE);
306 FileClose (fp);
307
308 switch (datatype) {
309 case OBJ_SUBMIT_BLOCK :
310 sbp = (SubmitBlockPtr) dataptr;
311 ssp = SeqSubmitNew ();
312 if (ssp != NULL) {
313 ssp->sub = sbp;
314 }
315 break;
316 case OBJ_SEQSUB :
317 ssp = (SeqSubmitPtr) dataptr;
318 if (ssp != NULL) {
319 sbp = ssp->sub;
320 }
321 break;
322 default :
323 break;
324 }
325
326 if (sbp != NULL) {
327 csp = sbp->cit;
328 if (csp != NULL) {
329 csp->date = DateFree (csp->date);
330 csp->date = DateCurr ();
331 }
332 }
333
334 return ssp;
335 }
336
337 Args myargs [] = {
338 {"Filename for FASTA input", "stdin", NULL, NULL,
339 FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
340 {"Filename for Seq-submit template", NULL, NULL, NULL,
341 FALSE, 't', ARG_FILE_IN, 0.0, 0, NULL},
342 {"Filename for ASN.1 output", "stdout", NULL, NULL,
343 FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
344 {"Organism name", NULL, NULL, NULL,
345 TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
346 {"General ID tag", NULL, NULL, NULL,
347 TRUE, 'g', ARG_STRING, 0.0, 0, NULL},
348 {"Sequences are mRNA?", "F", NULL, NULL,
349 TRUE, 'm', ARG_BOOLEAN, 0.0, 0, NULL},
350 {"HTGS phase?", "1", "0" ,"3",
351 FALSE, 'p', ARG_INT, 0.0, 0, NULL},
352 {"Parse colon in tag", "F", NULL, NULL,
353 TRUE, 'd', ARG_BOOLEAN, 0.0, 0, NULL},
354 {"Comment", NULL, NULL, NULL,
355 TRUE, 'c', ARG_STRING, 0.0, 0, NULL},
356 {"Filename for BioSource", NULL, NULL, NULL,
357 TRUE, 'b', ARG_FILE_IN, 0.0, 0, NULL},
358 {"Make identifier comment", "F", NULL, NULL,
359 TRUE, 'f', ARG_BOOLEAN, 0.0, 0, NULL},
360 };
361
362 Int2 Main (void)
363
364 {
365 BioSourcePtr biop = NULL;
366 Pointer dataptr;
367 Uint2 datatype;
368 CharPtr fasta_fname, template_fname, output_fname,
369 organism, generalid, comment, biosource_fname;
370 Int2 htgs_phase;
371 FILE *ifp, *ofp;
372 Boolean id_comment, is_mrna, parse_colon;
373 SeqSubmitPtr ssp;
374
375 ErrSetFatalLevel (SEV_FATAL);
376 ErrClearOptFlags (EO_SHOW_USERSTR);
377 UseLocalAsnloadDataAndErrMsg ();
378 ErrPathReset ();
379
380 if (! AllObjLoad ()) {
381 Message (MSG_FATAL, "AllObjLoad failed");
382 return 1;
383 }
384 if (! SubmitAsnLoad ()) {
385 Message (MSG_FATAL, "SubmitAsnLoad failed");
386 return 1;
387 }
388 if (! SeqCodeSetLoad ()) {
389 Message (MSG_FATAL, "SeqCodeSetLoad failed");
390 return 1;
391 }
392 if (! GeneticCodeTableLoad ()) {
393 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
394 return 1;
395 }
396
397 if (! GetArgs ("bulk2htgs", sizeof (myargs) / sizeof (Args), myargs)) {
398 return 0;
399 }
400
401 fasta_fname = myargs [0].strvalue;
402 template_fname = myargs [1].strvalue;
403 output_fname = myargs [2].strvalue;
404 organism = myargs [3].strvalue;
405 generalid = myargs [4].strvalue;
406 is_mrna = (Boolean) myargs [5].intvalue;
407 htgs_phase = (Int2) myargs [6].intvalue;
408 parse_colon = (Boolean) myargs [7].intvalue;
409 comment = myargs [8].strvalue;
410 biosource_fname = myargs [9].strvalue;
411 id_comment = (Boolean) myargs [10].intvalue;
412
413 if (StringHasNoText (output_fname)) {
414 Message (MSG_FATAL, "Unable to open output file");
415 return 1;
416 }
417
418 ssp = ReadSubmitBlock (template_fname);
419 if (ssp == NULL) {
420 Message (MSG_FATAL, "Unable to read template file");
421 return 1;
422 }
423 ssp->datatype = 1;
424
425 if (! StringHasNoText (biosource_fname)) {
426 biop = ReadBioSource (biosource_fname);
427 if (biop == NULL) {
428 Message (MSG_FATAL, "Unable to read BioSource file");
429 return 1;
430 }
431 }
432 if (biop == NULL && StringHasNoText (organism)) {
433 Message (MSG_FATAL, "Organism name or BioSource file is required for processing");
434 return 1;
435 }
436
437
438 ifp = FileOpen (fasta_fname, "r");
439 if (ifp == NULL) {
440 Message (MSG_FATAL, "Unable to open input file");
441 return 1;
442 }
443
444 ofp = FileOpen (output_fname, "w");
445 if (ofp == NULL) {
446 Message (MSG_FATAL, "Unable to create output file");
447 return 1;
448 }
449
450 while ((dataptr = ReadAsnFastaOrFlatFile (ifp, &datatype, NULL, FALSE,
451 FALSE, TRUE, FALSE)) != NULL) {
452 ProcessOneRecord (ssp, organism, biop, generalid, ofp,
453 is_mrna, htgs_phase, parse_colon, id_comment,
454 comment, datatype, dataptr);
455 }
456
457 FileClose (ofp);
458 FileClose (ifp);
459
460 BioSourceFree (biop);
461 SeqSubmitFree (ssp);
462
463 return 0;
464 }
465
466 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |