NCBI C Toolkit Cross Reference

C/demo/releasescan.c


  1 /*   releasescan.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  releasescan.c
 27 *
 28 * Author:  Jonathan Kans
 29 *
 30 * Version Creation Date:   6/6/00
 31 *
 32 * $Revision: 6.2 $
 33 *
 34 * File Description: 
 35 *
 36 * Modifications:  
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45 /* scans binary ASN.1 Bioseq-set release files */
 46 
 47 #include <ncbi.h>
 48 #include <objall.h>
 49 #include <objsset.h>
 50 #include <objsub.h>
 51 #include <objfdef.h>
 52 #include <seqport.h>
 53 #include <sequtil.h>
 54 #include <sqnutils.h>
 55 #include <subutil.h>
 56 #include <tofasta.h>
 57 #include <gather.h>
 58 #include <toasn3.h>
 59 #include <explore.h>
 60 
 61 static Boolean LIBCALLBACK DoFeat (SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
 62 
 63 {
 64   DbtagPtr     dbt;
 65   GBQualPtr    gbq;
 66   GeneRefPtr   grp;
 67   CharPtr      label;
 68   ObjectIdPtr  oip;
 69   BioseqPtr    prod;
 70   SeqFeatPtr   prot;
 71   ProtRefPtr   prp;
 72   RnaRefPtr    rrp;
 73   SeqIdPtr     sip;
 74   SeqLocPtr    slp;
 75   Int4         start;
 76   Int4         stop;
 77   Char         str [256];
 78   tRNAPtr      trna;
 79   ValNodePtr   vnp;
 80 
 81   FILE  *fp;
 82 
 83   fp = (FILE *) context->userdata;
 84 
 85   label = (CharPtr) FeatDefTypeLabel (sfp);
 86   if (StringCmp (label, "Gene") == 0) {
 87     label = "gene";
 88   }
 89   if (StringHasNoText (label)) {
 90     label = "???";
 91   }
 92 
 93   slp = SeqLocFindNext (sfp->location, NULL);
 94   if (slp == NULL) return TRUE;
 95 
 96   start = GetOffsetInBioseq (slp, context->bsp, SEQLOC_START) + 1;
 97   stop = GetOffsetInBioseq (slp, context->bsp, SEQLOC_STOP) + 1;
 98   fprintf (fp, "%ld\t%ld\t%s\n", (long) start, (long) stop, label);
 99 
100   while ((slp = SeqLocFindNext (sfp->location, slp)) != NULL) {
101     start = GetOffsetInBioseq (slp, context->bsp, SEQLOC_START) + 1;
102     stop = GetOffsetInBioseq (slp, context->bsp, SEQLOC_STOP) + 1;
103     if (start != 0 && stop != 0) {
104       fprintf (fp, "%ld\t%ld\n", (long) start, (long) stop);
105     }
106   }
107 
108   switch (context->seqfeattype) {
109     case SEQFEAT_GENE :
110       grp = (GeneRefPtr) sfp->data.value.ptrvalue;
111       if (grp != NULL) {
112         StringNCpy_0 (str, (CharPtr) grp->locus, sizeof (str));
113         if (! StringHasNoText (str)) {
114           fprintf (fp, "\t\t\tgene\t%s\n", str);
115         }
116         for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
117           StringNCpy_0 (str, (CharPtr) vnp->data.ptrvalue, sizeof (str));
118           if (! StringHasNoText (str)) {
119             fprintf (fp, "\t\t\tgene_syn\t%s\n", str);
120           }
121         }
122       }
123       break;
124     case SEQFEAT_CDREGION :
125       prod = BioseqFind (SeqLocId (sfp->product));
126       prot = SeqMgrGetBestProteinFeature (prod, NULL);
127       if (prot != NULL) {
128         prp = (ProtRefPtr) prot->data.value.ptrvalue;
129         if (prp != NULL) {
130           if (prp->name != NULL) {
131             for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
132               StringNCpy_0 (str, (CharPtr) vnp->data.ptrvalue, sizeof (str));
133               if (! StringHasNoText (str)) {
134                 fprintf (fp, "\t\t\tproduct\t%s\n", str);
135               }
136             }
137           } else if (prp->desc != NULL) {
138             StringNCpy_0 (str, prp->desc, sizeof (str));
139             if (! StringHasNoText (str)) {
140               fprintf (fp, "\t\t\tproduct\t%s\n", str);
141             }
142           }
143           for (vnp = prp->activity; vnp != NULL; vnp = vnp->next) {
144             StringNCpy_0 (str, (CharPtr) vnp->data.ptrvalue, sizeof (str));
145             if (! StringHasNoText (str)) {
146               fprintf (fp, "\t\t\tfunction\t%s\n", str);
147             }
148           }
149           for (vnp = prp->ec; vnp != NULL; vnp = vnp->next) {
150             StringNCpy_0 (str, (CharPtr) vnp->data.ptrvalue, sizeof (str));
151             if (! StringHasNoText (str)) {
152               fprintf (fp, "\t\t\tEC_number\t%s\n", str);
153             }
154           }
155         }
156       }
157       if (prod != NULL) {
158         for (sip = prod->id; sip != NULL; sip = sip->next) {
159           if (sip->choice == SEQID_GENBANK ||
160               sip->choice == SEQID_EMBL ||
161               sip->choice == SEQID_DDBJ) {
162             if (SeqIdWrite (sip, str, PRINTID_TEXTID_ACC_VER, sizeof (str)) != NULL) {
163               fprintf (fp, "\t\t\tprotein_id\t%s\n", str);
164             }
165           }
166         }
167       }
168       break;
169     case SEQFEAT_RNA :
170       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
171       if (rrp != NULL) {
172         switch (rrp->ext.choice) {
173           case 1 :
174             StringNCpy_0 (str, (CharPtr) rrp->ext.value.ptrvalue, sizeof (str));
175             if (! StringHasNoText (str)) {
176               fprintf (fp, "\t\t\tproduct\t%s\n", str);
177             }
178             break;
179           case 2 :
180             trna = rrp->ext.value.ptrvalue;
181             if (trna != NULL) {
182               FeatDefLabel (sfp, str, sizeof (str) - 1, OM_LABEL_CONTENT);
183               if (! StringHasNoText (str)) {
184                 fprintf (fp, "\t\t\tproduct\t%s\n", str);
185               }
186             }
187             break;
188           default :
189             break;
190         }
191       }
192       break;
193     default :
194       break;
195   }
196   if (! StringHasNoText (sfp->comment)) {
197     fprintf (fp, "\t\t\tnote\t%s\n", sfp->comment);
198   }
199   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
200     if (! StringHasNoText (gbq->qual)) {
201       if (! StringHasNoText (gbq->val)) {
202         fprintf (fp, "\t\t\t%s\t%s\n", gbq->qual, gbq->val);
203       }
204     }
205   }
206   for (vnp = sfp->dbxref; vnp != NULL; vnp = vnp->next) {
207     dbt = (DbtagPtr) vnp->data.ptrvalue;
208     if (dbt != NULL) {
209       if (! StringHasNoText (dbt->db)) {
210         oip = dbt->tag;
211         if (oip->str != NULL && (! StringHasNoText (oip->str))) {
212           fprintf (fp, "\t\t\tdb_xref\t%s:%s\n", dbt->db, oip->str);
213         } else {
214           fprintf (fp, "\t\t\tdb_xref\t%s:%ld\n", dbt->db, (long) oip->id);
215         }
216       }
217     }
218   }
219 
220   return TRUE;
221 }
222 
223 static void DoBioseq (BioseqPtr bsp, Pointer userdata)
224 
225 {
226   FILE     *fp;
227   Char     str [41];
228   CharPtr  tmp;
229   ValNode  vn;
230 
231   /* do not process protein bioseqs here */
232 
233   if (! ISA_na (bsp->mol)) return;
234 
235   fp = (FILE *) userdata;
236 
237   if (bsp->repr == Seq_repr_seg) {
238 
239     /* print FASTA ID chain and SeqLoc for segmented bioseq */
240 
241     MemSet ((Pointer) &vn, 0, sizeof (ValNode));
242     vn.choice = SEQLOC_MIX;
243     vn.data.ptrvalue = bsp->seq_ext;
244     tmp = SeqLocPrint ((SeqLocPtr) &vn);
245     if (tmp != NULL) {
246       SeqIdWrite (bsp->id, str, PRINTID_FASTA_LONG, sizeof (str));
247       fprintf (fp, ">%s %s\n", str, tmp);
248     }
249     MemFree (tmp);
250 
251   } else {
252 
253     /* normal FASTA output, including for raw parts of segmented bioseq */
254 
255     BioseqToFasta (bsp, fp, ISA_na (bsp->mol));
256   }
257 
258   /* features on parts are indexed on segmented parent coordinates */
259 
260   if (SeqMgrGetParentOfPart (bsp, NULL) != NULL) return;
261 
262   /* visit features indexed on this bioseq */
263 
264   SeqMgrExploreFeatures (bsp, userdata, DoFeat, NULL, NULL, NULL);
265 }
266 
267 static void DoRecord (SeqEntryPtr sep, Pointer userdata)
268 
269 {
270   /* index features on all bioseqs in current record */
271 
272   SeqMgrIndexFeatures (0, sep->data.ptrvalue);
273 
274   /* explore record, visit every bioseq */
275 
276   VisitBioseqsInSep (sep, userdata, DoBioseq);
277 
278   /* record cleaned up by ScanBioseqSetRelease */
279 }
280 
281 /* command-line argument list */
282 
283 #define p_argInputPath  0
284 #define o_argOutputFile 1
285 #define x_argFileSelect 2
286 #define b_argBinaryFile 3
287 #ifdef OS_UNIX
288 #define c_argCompressed 4
289 #endif
290 
291 Args myargs [] = {
292   {"Path to files", NULL, NULL, NULL,
293     TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
294   {"Output File Name", "stdout", NULL, NULL,
295     FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
296   {"File selection substring", ".aso", NULL, NULL,
297     TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
298   {"Binary file", "T", NULL, NULL,
299     TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
300 #ifdef OS_UNIX
301   {"Compressed file", "F", NULL, NULL,
302     TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
303 #endif
304 };
305 
306 /* toolkit ncbimain.c has C main function, wraps application Main */
307 
308 Int2 Main (void)
309 
310 {
311   Boolean     binary, compressed = FALSE;
312   CharPtr     dir, progname, str, subfile;
313   FILE        *fp;
314   ValNodePtr  head, vnp;
315   Char        path [PATH_MAX];
316 
317   ErrSetFatalLevel (SEV_FATAL);
318   ErrClearOptFlags (EO_SHOW_USERSTR);
319   UseLocalAsnloadDataAndErrMsg ();
320   ErrPathReset ();
321 
322   /* resolve internal pointers in object loader parse tables */
323 
324   if (! AllObjLoad ()) {
325     Message (MSG_FATAL, "AllObjLoad failed");
326     return 1;
327   }
328   if (! SubmitAsnLoad ()) {
329     Message (MSG_FATAL, "SubmitAsnLoad failed");
330     return 1;
331   }
332   if (! SeqCodeSetLoad ()) {
333     Message (MSG_FATAL, "SeqCodeSetLoad failed");
334     return 1;
335   }
336   if (! GeneticCodeTableLoad ()) {
337     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
338     return 1;
339   }
340 
341   ProgramPath (path, sizeof (path));
342   progname = StringRChr (path, DIRDELIMCHR);
343   if (progname != NULL) {
344     progname++;
345   } else {
346     progname = "releasescan";
347   }
348 
349   /* process command-line arguments */
350 
351   if (! GetArgs (progname, sizeof (myargs) / sizeof (Args), myargs)) {
352     return 0;
353   }
354 
355   dir = myargs [p_argInputPath].strvalue;
356   binary = (Boolean) myargs [b_argBinaryFile].intvalue;
357 #ifdef OS_UNIX
358   compressed = (Boolean) myargs [c_argCompressed].intvalue;
359 #endif
360 
361 #ifndef OS_UNIX
362   if (compressed) {
363     Message (MSG_ERROR, "Can only decompress on-the-fly on UNIX machines");
364     return 1;
365   }
366 #endif
367 
368   fp = FileOpen (myargs [o_argOutputFile].strvalue, "a");
369   if (fp == NULL) {
370     Message (MSG_FATAL, "FileOpen failed");
371     return 1;
372   }
373 
374   head = DirCatalog (dir);
375 
376   /* process appropriate files within specified directory */
377 
378   for (vnp = head; vnp != NULL; vnp = vnp->next) {
379 
380     /* vnp->choice is 0 for file, 1 for subdirectory */
381 
382     if (vnp->choice == 0) {
383       str = (CharPtr) vnp->data.ptrvalue;
384       if (! StringHasNoText (str)) {
385         subfile = myargs [x_argFileSelect].strvalue;
386 
387         /* does filename have desired substring? */
388 
389         if (StringHasNoText (subfile) || StringStr (str, subfile) != NULL) {
390 #ifdef OS_UNIX
391           /* printf ("%s\n", str); */
392 #endif
393 
394           /* open a file, read one record at a time, present it to callback */
395 
396           StringNCpy_0 (path, dir, sizeof (path));
397           FileBuildPath (path, NULL, str);
398 
399           ScanBioseqSetRelease (path, binary, compressed, (Pointer) fp, DoRecord);
400         }
401       }
402     }
403   }
404 
405   ValNodeFreeData (head);
406 
407   FileClose (fp);
408 
409   return 0;
410 }
411 
412 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.