|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/demo/condense.c |
source navigation diff markup identifier search freetext search file search |
/*****************************************************************************
*
*   condense.c
*      entrez version
*
*   "Fasta style" SeqIds include a string indicating the class of SeqId,
*   a vertical bar, then the fields from the SeqId separated by vertical
*   bars. If an (OPTIONAL) field is missing, the vertical bar must still
*   be there.
*
*   local     = lcl|integer or string
*   gibbsq    = bbs|integer
*   gibbmt    = bbm|integer
*   giim      = gim|integer
*   genbank   = gb|accession|locus
*   embl      = emb|accession|locus
*   pir       = pir|accession|name
*   swissprot = sp|accession|name
*   patent    = pat|country|patent number (string)|seq number (integer)
*   other     = oth|accession|name|release
*   general   = gnl|database(string)|id (string or number)
*   gi        = gi|integer
*   ddbj      = dbj|accession|locus
*   prf       = prf|accession|name
*   pdb       = pdb|entry name (string)|chain id (char)
*
*****************************************************************************/
28 #include <ent2api.h>
29 #include <accpubseq.h>
30 #include <sequtil.h>
31 #include <tofasta.h>
32 #include <asn2ff.h>
33 #include <explore.h>
34
35 #define NUMARGS 6
36 Args myargs[NUMARGS] = {
37 {"Filename for output ","stdout", NULL,NULL,FALSE,'o',ARG_FILE_OUT, 0.0,0,NULL},
38 {"Verbose", "F", NULL, NULL, FALSE, 'v', ARG_BOOLEAN, 0.0, 0, NULL } ,
39 {"GI id for single Bioseq to dump" ,"0","1","99999999",TRUE,'g',ARG_INT,0.0,0,NULL},
40 {"Fasta style SeqId ENCLOSED IN QUOTES: lcl|int or str bbs|int bbm|int gb|acc|loc emb|acc|loc pir|acc|name sp|acc|name pat|country|patent|seq gi|int dbj|acc|loc prf|acc|name pdb|entry|chain ",
41 NULL,NULL,NULL,TRUE,'s',ARG_STRING,0.0,0,NULL},
42 {"Maximum number of neighbors to get" ,"100","1","60000",TRUE,'x',ARG_INT,0.0,0,NULL},
43 {"Minimum neighbor score to use" ,"2000","1","99999999",TRUE,'m',ARG_INT,0.0,0,NULL}
44 };
45
46 static char * norg = "Unknown";
47
48 #define PUB_LIMIT 50
49
50 typedef struct seqinfo {
51 BioseqPtr bsp;
52 Int4 gi;
53 SeqIdPtr sip;
54 Boolean is_pdb, is_protdb, is_refseq, is_model, is_patent;
55 Int4 taxid;
56 Int4 length;
57 CharPtr orgname;
58 Char Defline[80];
59 Int2 pubctr; /* number of citations */
60 Int4 PubMedId [PUB_LIMIT];
61 CitArtPtr cap [PUB_LIMIT];
62 ProtRefPtr prp;
63 GeneRefPtr grp;
64 SeqFeatPtr cds;
65 } SeqInfo, PNTR SeqInfoPtr;
66
67
68 void BioseqAnalyze (SeqInfoPtr seqs, BioseqPtr bsp, Int4 index);
69
70 typedef struct stringlist {
71 Int4 count;
72 Int4 id;
73 VoidPtr data;
74 Char * str;
75 } StrList, PNTR StrListPtr;
76
77 #define MAXSTR 1000
78 StrList sl[MAXSTR];
79 StrListPtr strp[MAXSTR];
80 Int4 strctr;
81
82 void ResetStrList(void)
83 {
84 strctr = 0;
85 return;
86 }
87
88 void AddStrToList(CharPtr str)
89 {
90 Int4 i;
91
92 if (strctr == MAXSTR) return;
93 if (str == NULL) return;
94 if (*str == '\0') return;
95
96 for (i = 0; i < strctr; i++)
97 {
98 if (! StringICmp(str, sl[i].str))
99 {
100 sl[i].count++;
101 return;
102 }
103 }
104 sl[strctr].str = str;
105 sl[strctr].count = 1;
106 strp[strctr] = &(sl[strctr]);
107 strctr++;
108 return;
109 }
110
111 void AddDataToList(Int4 ID, VoidPtr data)
112 {
113 Int4 i;
114
115 if (strctr == MAXSTR) return;
116 if (ID == 0) return;
117
118 for (i = 0; i < strctr; i++)
119 {
120 if (ID == sl[i].id)
121 {
122 sl[i].count++;
123 return;
124 }
125 }
126 sl[strctr].id = ID;
127 sl[strctr].count = 1;
128 sl[strctr].data = data;
129 strp[strctr] = &(sl[strctr]);
130 strctr++;
131 return;
132 }
133
134 int LIBCALLBACK StrListCountString (VoidPtr a, VoidPtr b)
135 {
136 StrListPtr ap, bp;
137 int retval = 0;
138
139 ap = * (StrListPtr *)a;
140 bp = * (StrListPtr *)b;
141
142 if (ap->count != bp->count)
143 return (int)(bp->count - ap->count);
144 return (int)StringICmp(ap->str, bp->str);
145 }
146
147 void SortByString(void)
148 {
149 Nlm_HeapSort((VoidPtr)strp, (size_t)strctr, sizeof(StrListPtr),
150 StrListCountString);
151 return;
152 }
153
154 int LIBCALLBACK StrListCountID (VoidPtr a, VoidPtr b)
155 {
156 StrListPtr ap, bp;
157 int retval = 0;
158
159 ap = * (StrListPtr *)a;
160 bp = * (StrListPtr *)b;
161
162 if (ap->count != bp->count)
163 return (int)(bp->count - ap->count);
164 return (int)(ap->id - bp->id);
165 }
166
167 void SortByID(void)
168 {
169 Nlm_HeapSort((VoidPtr)strp, (size_t)strctr, sizeof(StrListPtr),
170 StrListCountID);
171 return;
172 }
173
174 CharPtr GetGeneString (GeneRefPtr grp)
175 {
176 if (grp == NULL)
177 return "No Gene given";
178 if (grp->locus != NULL)
179 return (grp->locus);
180 else if (grp->desc != NULL)
181 return (grp->desc);
182 else if (grp->syn != NULL)
183 return (CharPtr)(grp->syn->data.ptrvalue);
184
185 return "No Gene Name Found";
186 }
187
188 CharPtr GetProtString (ProtRefPtr prp)
189 {
190 if (prp == NULL)
191 return "No Protein Given";
192 if (prp->name != NULL)
193 return (CharPtr) (prp->name->data.ptrvalue);
194 else if (prp->desc != NULL)
195 return (prp->desc);
196
197 return "No Protein Name Found";
198 }
199
200 Int2 Main(void)
201 {
202 Int2 retcode;
203 SeqIdPtr sip=NULL; /* Same as a ValNodePtr, generic data ptr implemented */
204 /* as a choice and a union. */
205
206 Int4 gi, i, num, j, ctr, year;
207 BioseqPtr query, bsp;
208 AsnIoPtr asnout=NULL;
209 FILE * fp=NULL;
210 Boolean is_network, doit;
211 Char tbuf[80];
212 CharPtr outmode, title, name;
213 ValNode vn;
214 CitArtPtr cap;
215 ValNodePtr vnp;
216 CitJourPtr cjp;
217 ImprintPtr ip;
218 Int2 pdbcnt = 0, protdbcnt = 0, refseqcnt=0, modelcnt=0, patentcnt=0;
219 Boolean verbose;
220 Entrez2RequestPtr e2rq;
221 Entrez2ReplyPtr e2ry;
222 Entrez2LinkSetPtr e2lp;
223 Int4Ptr linkuids, linkscores;
224 Int4 MaxSeq, MinScore, LowestScore, HighestScore;
225 SeqInfoPtr seqs;
226
227 /*
228 ** Get program arguments
229 */
230
231 if ( !GetArgs("Condense 1.0", NUMARGS, myargs) ) return 1;
232
233 /*
234 ** Set parameters from the command line
235 */
236
237 verbose = (Boolean)myargs[1].intvalue;
238 gi = myargs[2].intvalue;
239 retcode = 0;
240 MaxSeq = myargs[4].intvalue;
241 MinScore = myargs[5].intvalue;
242 LowestScore = INT4_MAX;
243 HighestScore = 0;
244
245 if (myargs[3].strvalue != NULL)
246 {
247 if (gi)
248 {
249 ErrPostEx(SEV_FATAL, 1,0, "Use only one of -g or -s");
250 return 1;
251 }
252
253 sip = SeqIdParse((CharPtr)(myargs[3].strvalue));
254 if (sip == NULL)
255 {
256 ErrPostEx(SEV_FATAL, 1,0, "Can't parse [%s]",
257 (CharPtr)(myargs[3].strvalue));
258 return 1;
259 }
260 }
261 else if (! gi)
262 {
263 ErrPostEx(SEV_FATAL, 1,0, "Must supply one of -g or -s");
264 return 1;
265 }
266
267 /*
268 ** Initialize, open
269 */
270
271 if ( !PUBSEQBioseqFetchEnable("Condense", TRUE) ) {
272 ErrPostEx(SEV_FATAL, 1,0, "Can't initialize PUBSEQ");
273 return 1;
274 }
275
276 if (sip != NULL)
277 {
278 gi = GetGIForSeqId(sip);
279 if (! gi)
280 {
281 PUBSEQFini();
282 SeqIdWrite(sip, tbuf, PRINTID_FASTA_SHORT,40);
283 ErrPostEx(SEV_FATAL, 1,0, "Couldn't find SeqId [%s]", tbuf);
284 return 1;
285 }
286 SeqIdFree(sip);
287 }
288
289 vn.choice = SEQID_GI;
290 vn.next = NULL;
291 vn.data.intvalue = gi;
292
293 query = BioseqLockById(&vn);
294
295
296 if (query == NULL)
297 {
298 ErrPostEx(SEV_FATAL, 1,0,"Could not retrieve entry for GI %ld", (long)gi);
299 return 1;
300 }
301
302 /** get the neighbors ***/
303
304 EntrezSetProgramName ("Condense");
305
306 /***
307 EntrezSetServer ("pluto", 5701, "/entrez/olegh/e2s.cgi");
308 ***/
309
310 e2rq = EntrezCreateGetLinksRequest ( "protein", 0, 1, &gi, NULL,
311 "protein_protein", MaxSeq, FALSE, TRUE);
312
313 if (e2rq == NULL)
314 printf("Couldn't create link request\n");
315
316 if ((e2ry = EntrezSynchronousQuery(e2rq)) == NULL)
317 printf("Link query failed\n");
318
319 if ((e2lp = EntrezExtractLinksReply(e2ry)) == NULL)
320 printf("Couldn't extract links from reply\n");
321
322 if (e2lp == NULL)
323 printf("Got NULL LinkSet\n");
324 else
325 {
326 num = e2lp->ids->num;
327 linkuids = (Int4Ptr) BSMerge(e2lp->ids->uids, NULL);
328 linkscores = (Int4Ptr) BSMerge(e2lp->data, NULL);
329
330 if (verbose)
331 printf ("Got %ld neighbors\n", (long)(num));
332 if (num > MaxSeq)
333 num = MaxSeq;
334
335 seqs = (SeqInfoPtr) MemNew((size_t)((num + 1) * sizeof(SeqInfo)));
336
337 BioseqAnalyze(seqs, query, 0);
338 ctr = 1;
339 for (i = 0; (i < num) && ((linkscores[i]) >= MinScore); i++)
340 {
341 doit = TRUE;
342 gi = linkuids[i];
343 for (j = 0; j < num; j++)
344 {
345 if (seqs[j].gi == gi)
346 {
347 doit = FALSE;
348 break;
349 }
350 }
351 if (doit)
352 {
353 vn.data.intvalue = gi;
354 bsp = BioseqLockById(&vn);
355 if (bsp == NULL)
356 printf ("Couldn't fetch %ld\n", (long)(gi));
357 else if (verbose)
358 printf ("[%d] Got %ld weight = %ld\n", (int)i, (long)(gi),
359 (long)(linkscores[i]));
360 BioseqAnalyze (seqs, bsp, ctr);
361 if (linkscores[i] < LowestScore)
362 LowestScore = linkscores[i];
363 if (linkscores[i] > HighestScore)
364 HighestScore = linkscores[i];
365 ctr++;
366 }
367 }
368 }
369
370 fp = FileOpen((CharPtr)myargs[0].strvalue, "w");
371 for (i = 0; i < ctr; i++)
372 {
373 if (verbose)
374 fprintf(stdout, "[%ld] %s\n", seqs[i].gi, seqs[i].Defline);
375 SeqIdWrite(seqs[i].sip, tbuf, PRINTID_FASTA_SHORT, 40);
376 if (verbose)
377 fprintf(stdout, " %s %s\n", tbuf, seqs[i].orgname);
378 for (j = 0; j < seqs[i].pubctr; j++)
379 {
380 title = NULL;
381 cap = seqs[i].cap[j];
382 if (cap != NULL)
383 {
384 for (vnp = cap->title; vnp != NULL; vnp = vnp->next)
385 {
386 switch (vnp->choice)
387 {
388 case 1: /* name */
389 if (title == NULL)
390 title = vnp->data.ptrvalue;
391 break;
392 case 3: /* trans */
393 title = vnp->data.ptrvalue;
394 break;
395 default:
396 break;
397 }
398 }
399 if (title == NULL)
400 title = "No Title";
401
402 if (verbose)
403 fprintf(stdout, " [%ld] %s\n", seqs[i].PubMedId[j], title);
404 } else if (verbose)
405 fprintf(stdout, " [%ld] CitArt is NULL\n", seqs[i].PubMedId[j]);
406 if (cap != NULL)
407 {
408 vn.choice = PUB_Article;
409 vn.data.ptrvalue = cap;
410 PubLabel(&vn, tbuf, 70, OM_LABEL_CONTENT);
411 if (verbose)
412 fprintf(stdout, " [%s]\n", tbuf);
413 }
414 }
415 }
416
417 /**** real report ****/
418 SeqIdWrite(seqs[0].sip, tbuf, PRINTID_FASTA_SHORT, 40);
419
420 fprintf(fp, "Report for %s %s\n", tbuf, seqs[0].Defline);
421 fprintf(fp, "Organism=[%s] ProtName=[%s] GeneName=[%s]\n", seqs[0].orgname,
422 GetProtString(seqs[0].prp), GetGeneString(seqs[0].grp));
423 if (seqs[0].cds != NULL)
424 fprintf(fp, "A nucleic acid sequence exists for this protein\n");
425 fprintf(fp, "\nEvaluating %ld neighbor sequences with scores from %ld to %ld\n\n",
426 (long)(ctr), (long)HighestScore, (long)LowestScore);
427 for (i = 0; i < ctr; i++)
428 {
429 if (seqs[i].is_pdb)
430 pdbcnt++;
431 if (seqs[i].is_protdb)
432 protdbcnt++;
433 if (seqs[i].is_refseq)
434 refseqcnt++;
435 if (seqs[i].is_model)
436 modelcnt++;
437 if (seqs[i].is_patent)
438 patentcnt++;
439 }
440 if (pdbcnt) fprintf(fp, "[%ld] PDB; ", (long)pdbcnt);
441 if (protdbcnt) fprintf(fp, "[%ld] Protein Dbs; ", (long)protdbcnt);
442 if (refseqcnt) fprintf(fp, "[%ld] RefSeq; ", (long)refseqcnt);
443 if (modelcnt) fprintf(fp, "[%ld] Models; ", (long)modelcnt);
444 if (patentcnt) fprintf(fp, "[%ld] Patents; ", (long)patentcnt);
445 fprintf(fp, "\n");
446
447 ResetStrList();
448 num = 0;
449 for (i = 0; i < ctr; i++)
450 {
451 if (seqs[i].taxid == seqs[0].taxid)
452 num++;
453 else if (seqs[i].taxid != 0)
454 AddStrToList(seqs[i].orgname);
455 }
456 fprintf(fp, "[%ld] were from %s. [%ld] were from other organisms.\n", (long)num,
457 seqs[0].orgname, (long)(ctr - num));
458
459 SortByString();
460 for (i = 0; i < strctr; i++)
461 fprintf(fp, " [%ld] %s\n", (long)(strp[i]->count), strp[i]->str);
462 fprintf(fp, "\n");
463
464 ResetStrList();
465 fprintf(fp, "Protein names used were:\n");
466 for (i = 0; i < ctr; i++)
467 {
468 AddStrToList(GetProtString(seqs[i].prp));
469 }
470 SortByString();
471 for (i = 0; i < strctr; i++)
472 fprintf(fp, " [%ld] %s\n", (long)(strp[i]->count), strp[i]->str);
473
474 fprintf(fp, "\n");
475
476 ResetStrList();
477 fprintf(fp, "Gene names used were:\n");
478 for (i = 0; i < ctr; i++)
479 {
480 AddStrToList(GetGeneString(seqs[i].grp));
481 }
482 SortByString();
483 for (i = 0; i < strctr; i++)
484 fprintf(fp, " [%ld] %s\n", (long)(strp[i]->count), strp[i]->str);
485
486 fprintf(fp, "\n");
487
488 ResetStrList();
489 fprintf(fp, "Publications cited were:\n");
490 for (i = 0; i < ctr; i++)
491 {
492 for (j = 0; j < PUB_LIMIT; j++)
493 {
494 if (seqs[i].cap[j] != NULL)
495 AddDataToList(seqs[i].PubMedId[j], seqs[i].cap[j]);
496 }
497 }
498 SortByID();
499 for (i = 0; i < strctr; i++)
500 {
501 cap = (CitArtPtr)(strp[i]->data);
502 vn.choice = PUB_Article;
503 vn.data.ptrvalue = cap;
504 PubLabel(&vn, tbuf, 70, OM_LABEL_CONTENT);
505 fprintf(fp, "\n[%ld] %s\n", (long)(strp[i]->count), tbuf);
506 title = NULL;
507 for (vnp = cap->title; vnp != NULL; vnp = vnp->next)
508 {
509 switch (vnp->choice)
510 {
511 case 1: /* name */
512 if (title == NULL)
513 title = vnp->data.ptrvalue;
514 break;
515 case 3: /* trans */
516 title = vnp->data.ptrvalue;
517 break;
518 default:
519 break;
520 }
521 }
522 if (title != NULL)
523 fprintf(fp, "%s\n", title);
524 }
525
526 PUBSEQFini();
527 FileClose(fp);
528
529
530 return 0;
531 }
532
533 void BioseqAnalyze (SeqInfoPtr seqs, BioseqPtr bsp, Int4 index)
534 {
535 SeqInfoPtr sip;
536 SeqIdPtr tsip;
537 SeqMgrDescContext smc;
538 SeqMgrFeatContext smf;
539 ValNodePtr vnp, xp;
540 SeqFeatPtr sfp, cds;
541 BioSourcePtr bp;
542 OrgRefPtr orp;
543 PubdescPtr pdp;
544 CitArtPtr cap;
545 Int2 ctr;
546 Boolean doit;
547 GeneRefPtr grp;
548 ProtRefPtr prp;
549
550 if (bsp == NULL)
551 return;
552
553 SeqMgrIndexFeatures(0, bsp);
554 sip = &(seqs[index]);
555 for (tsip = bsp->id; tsip != NULL; tsip = tsip->next)
556 {
557 switch (tsip->choice)
558 {
559 case SEQID_GI:
560 sip->gi = tsip->data.intvalue;
561 break;
562 case SEQID_GENBANK:
563 case SEQID_EMBL:
564 case SEQID_DDBJ:
565 sip->sip = tsip;
566 break;
567 case SEQID_SWISSPROT:
568 case SEQID_PIR:
569 case SEQID_PRF:
570 sip->sip = tsip;
571 sip->is_protdb = TRUE;
572 break;
573 case SEQID_PDB:
574 sip->sip = tsip;
575 sip->is_pdb = TRUE;
576 break;
577 case SEQID_PATENT:
578 sip->is_patent = TRUE;
579 break;
580 case SEQID_OTHER:
581 sip->sip = tsip;
582 if (*((TextSeqIdPtr)(tsip->data.ptrvalue))->accession == 'N')
583 sip->is_refseq = TRUE;
584 else
585 sip->is_model = TRUE;
586 break;
587 default:
588 break;
589
590 }
591 }
592 sip->length = bsp->length;
593
594 for (vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &smc);
595 vnp != NULL;
596 vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_source, &smc))
597 {
598 bp = (BioSourcePtr)(vnp->data.ptrvalue);
599 if (bp->org != NULL)
600 {
601 orp = bp->org;
602 if ((bp->is_focus) || (sip->taxid == 0))
603 {
604 for (xp = orp->db; xp != NULL; xp = xp->next)
605 {
606 if (! StringICmp(((DbtagPtr)(xp->data.ptrvalue))->db, "Taxon"))
607 sip->taxid = ((DbtagPtr)(xp->data.ptrvalue))->tag->id;
608 }
609 sip->orgname = orp->taxname;
610 }
611 }
612 }
613
614 CreateDefLine(NULL, bsp, sip->Defline, 70, 0, NULL, sip->orgname);
615 if (sip->orgname == NULL)
616 sip->orgname = "Unknown Organism";
617
618 for (vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_pub, &smc);
619 vnp != NULL;
620 vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_pub, &smc))
621 {
622 pdp = (PubdescPtr)(vnp->data.ptrvalue);
623 doit = FALSE;
624 ctr = sip->pubctr;
625 for (xp = pdp->pub; xp != NULL && ctr < PUB_LIMIT; xp = xp->next)
626 {
627 switch (xp->choice)
628 {
629 case PUB_Muid:
630 if (sip->PubMedId[ctr] != 0)
631 break;
632 case PUB_PMid:
633 sip->PubMedId[ctr] = xp->data.intvalue;
634 doit = TRUE;
635 break;
636 case PUB_Article:
637 sip->cap[ctr] = (CitArtPtr)(xp->data.ptrvalue);
638 doit = TRUE;
639 break;
640 default:
641 break;
642 }
643 }
644 if (doit){ /* saved one */
645 sip->pubctr++;
646 }
647 if ( sip->pubctr >= PUB_LIMIT){
648 break;
649 }
650
651 }
652
653 cds = SeqMgrGetCDSgivenProduct(bsp, &smf);
654 sip->cds = cds;
655 prp = NULL;
656 grp = NULL;
657 sfp = SeqMgrGetBestProteinFeature(bsp, &smf);
658 if (sfp != NULL)
659 {
660 prp = (ProtRefPtr)(sfp->data.value.ptrvalue);
661 grp = SeqMgrGetGeneXref(sfp);
662 }
663 if ((grp == NULL) && (cds != NULL))
664 {
665 grp = SeqMgrGetGeneXref(cds);
666 if (grp == NULL)
667 {
668 sfp = SeqMgrGetOverlappingGene(cds->location, &smf);
669 if (sfp != NULL)
670 grp = (GeneRefPtr)(sfp->data.value.ptrvalue);
671 }
672 }
673 sip->prp = prp;
674 sip->grp = grp;
675
676
677 return;
678
679 }
680 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |