|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/tools/dust.c |
source navigation diff markup identifier search freetext search file search |
1 static char const rcsid[] = "$Id: dust.c,v 6.5 2003/05/30 17:25:36 coulouri Exp $";
2
3 /* dust.c
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: dust.c
29 *
30 * Author(s): Roma Tatusov, John Kuzio
31 *
32 * Version Creation Date: 5/26/95
33 *
34 * $Revision: 6.5 $
35 *
36 * File Description: a utility to find low complexity NA regions
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date Name Description of modification
41 * ------- ---------- -----------------------------------------------------
42 *
43 * $Log: dust.c,v $
44 * Revision 6.5 2003/05/30 17:25:36 coulouri
45 * add rcsid
46 *
47 * Revision 6.4 1999/08/18 17:50:47 kans
48 * include dust.h only after other headers are included to provide prototypes on Mac
49 *
50 * Revision 6.3 1999/08/18 16:26:17 sicotte
51 * Optimized by Moving a Malloc/Free outside of calling functions and other misc optimizations
52 *
53 * Revision 6.2 1999/03/12 17:03:29 kuzio
54 * cast
55 *
56 * Revision 6.1 1998/09/24 15:26:39 egorov
57 * Fix lint complaints
58 *
59 * Revision 6.0 1997/08/25 18:53:04 madden
60 * Revision changed to 6.0
61 *
62 * Revision 5.5 1997/04/24 20:50:41 kuzio
63 * quantify speed up -- remove useless loop
64 *
65 * Revision 5.4 1997/03/12 21:43:48 kuzio
66 * for interactive dusting: FilterSeq from: ,Int4 window, to: ,Int4Ptr exval,
67 *
68 * Revision 5.3 1997/02/26 15:59:55 kuzio
69 * cast: wo1 ((Int2) (flen-i), seq+i, i, cloc);
70 *
71 * Revision 5.2 1997/02/14 22:29:48 kuzio
72 * filter function updates
73 *
74 * Revision 5.1 1997/02/11 00:20:45 kuzio
75 * support for graphs added
76 *
77 * Revision 5.0 1996/05/28 13:43:15 ostell
78 * Set to revision 5.0
79 *
80 * Revision 4.3 1996/01/22 18:25:58 kuzio
81 * function info in err code levels
82 *
83 * Revision 4.2 1995/12/07 18:56:19 kuzio
84 * Trimmed error reports for invalid residues
85 *
86 * Revision 4.1 1995/11/01 12:31:59 kuzio
87 * fixed nested comment line 491
88 *
89 * Revision 4.0 1995/07/26 13:50:15 ostell
90 * force revision to 4.0
91 *
92 * Revision 1.2 1995/07/05 14:06:22 kuzio
93 * fix EOS typo in dusttripfind array setup
94 *
95 * Revision 1.1 1995/05/26 18:47:47 kuzio
96 * Initial revision
97 *
98 *
99 * ==========================================================================
100 */
101
102 #include <sequtil.h>
103 #include <objmgr.h>
104 #include <seqport.h>
105 #include <dust.h>
106
107 static char _this_module[] = "dust";
108 #undef THIS_MODULE
109 #define THIS_MODULE _this_module
110 static char _this_file[] = __FILE__;
111 #undef THIS_FILE
112 #define THIS_FILE _this_file
113
114 /* local, file scope, structures and variables */
115
116 typedef struct endpoints {
117 struct endpoints PNTR next;
118 Int4 from, to;
119 } DREGION;
120
121 typedef struct localcurrents {
122 Int4 curlevel, curstart, curend;
123 } DCURLOC;
124
125 /* local functions */
126
127 static Int4 dust_segs PROTO ((Int4, SeqPortPtr, Int4, DREGION PNTR,
128 Int4, Int4, Int4, Int4));
129 static Int4 wo PROTO ((Int4, SeqPortPtr, Int4, DCURLOC PNTR, Int4 PNTR, UcharPtr seq));
130 static void wo1 PROTO ((Int4, UcharPtr, Int4, DCURLOC PNTR));
131 static Int4 dusttripfind PROTO ((SeqPortPtr, UcharPtr, Int4, Int4,
132 Int4 PNTR));
133 static SeqLocPtr slpDust PROTO ((SeqPortPtr, SeqLocPtr, SeqIdPtr,
134 ValNodePtr PNTR, DREGION PNTR,
135 Int4, Int4));
136
137 /* a simple bioseq */
138
139 SeqLocPtr BioseqDust (BioseqPtr bsp, Int4 start, Int4 end,
140 Int2 level, Int2 window, Int2 minwin, Int2 linker)
141 {
142 SeqLocPtr slp = NULL; /* initialize */
143 ValNodePtr vnp = NULL;
144
145 SeqPortPtr spp;
146
147 DREGION PNTR reg, PNTR regold;
148 Int4 nreg;
149 Int4 l;
150 Int4 loopDustMax = 1;
151
152 /* error msg stuff */
153 /* ErrSetOptFlags (EO_MSG_CODES | EO_SHOW_FILELINE | EO_BEEP); */
154 ErrSetOptFlags (EO_MSG_CODES);
155
156 /* make sure bioseq is there */
157 if (!bsp)
158 {
159 ErrPostEx (SEV_ERROR, 1, 1,
160 "no bioseq");
161 ErrShow ();
162 return slp;
163 }
164 if (!ISA_na (bsp->mol))
165 {
166 ErrPostEx (SEV_WARNING, 1, 2,
167 "not nucleic acid");
168 ErrShow ();
169 return slp;
170 }
171
172 /* place for dusted regions */
173 reg = MemNew (sizeof (DREGION));
174 if (!reg)
175 {
176 ErrPostEx (SEV_FATAL, 1, 3,
177 "memory allocation error");
178 ErrShow ();
179 return slp;
180 }
181 reg->from = 0;
182 reg->to = 0;
183 reg->next = NULL;
184
185 /* do it */
186 spp = SeqPortNew (bsp, start, end, 0, Seq_code_ncbi2na);
187 if (!spp)
188 {
189 ErrPostEx (SEV_ERROR, 1, 4,
190 "sequence port open failed");
191 ErrShow ();
192 MemFree (reg);
193 return slp;
194 }
195
196 l = spp->totlen;
197 nreg = dust_segs (l, spp, start, reg, (Int4)level, (Int4)window, (Int4)minwin, (Int4)linker);
198 slp = slpDust (spp, NULL, bsp->id, &vnp, reg, nreg, loopDustMax);
199
200 /* clean up memory */
201 SeqPortFree (spp);
202 while (reg)
203 {
204 regold = reg;
205 reg = reg->next;
206 MemFree (regold);
207 }
208
209 return slp;
210 }
211
212 /* sequence location pointers */
213
214 SeqLocPtr SeqLocDust (SeqLocPtr this_slp,
215 Int2 level, Int2 window, Int2 minwin, Int2 linker)
216 {
217 SeqLocPtr next_slp, slp = NULL;
218 ValNodePtr vnp = NULL;
219
220 SeqIdPtr id;
221 BioseqPtr bsp;
222 SeqPortPtr spp;
223
224 DREGION PNTR reg, PNTR regold;
225 Int4 nreg;
226 Int4 start, end, l;
227 Int2 loopDustMax = 0;
228
229 /* error msg stuff */
230 ErrSetOptFlags (EO_MSG_CODES);
231
232 if (!this_slp)
233 {
234 ErrPostEx (SEV_ERROR, 2, 1,
235 "no sequence location given for dusting");
236 ErrShow ();
237 return slp;
238 }
239
240 /* place for dusted regions */
241 regold = reg = MemNew (sizeof (DREGION));
242 if (!reg)
243 {
244 ErrPostEx (SEV_FATAL, 2, 2,
245 "memory allocation error");
246 ErrShow ();
247 return slp;
248 }
249 reg->from = 0;
250 reg->to = 0;
251 reg->next = NULL;
252
253 /* count seqlocs */
254 next_slp = NULL;
255 while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
256 loopDustMax++;
257 if (!loopDustMax)
258 {
259 ErrPostEx (SEV_ERROR, 2, 3,
260 "can not find next seq loc");
261 ErrShow ();
262 }
263
264 /* loop for dusting as needed */
265 next_slp = NULL;
266 while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
267 {
268 /* offsets into actual sequence */
269 start = SeqLocStart (next_slp);
270 end = SeqLocStop (next_slp);
271
272 /* if all goes okay should get a seqport pointer */
273 id = SeqLocId (next_slp);
274 if (!id)
275 {
276 ErrPostEx (SEV_ERROR, 2, 4,
277 "no bioseq id");
278 ErrShow ();
279 continue;
280 }
281 bsp = BioseqLockById (id);
282 if (!bsp)
283 {
284 ErrPostEx (SEV_ERROR, 2, 5,
285 "no bioseq");
286 ErrShow ();
287 continue;
288 }
289 if (!ISA_na (bsp->mol))
290 {
291 ErrPostEx (SEV_WARNING, 2, 6,
292 "not nucleic acid");
293 ErrShow ();
294 BioseqUnlock (bsp);
295 continue;
296 }
297 spp = SeqPortNew (bsp, start, end, 0, Seq_code_ncbi2na);
298 BioseqUnlock (bsp);
299 if (!spp)
300 {
301 ErrPostEx (SEV_ERROR, 2, 7,
302 "sequence port open failed");
303 ErrShow ();
304 continue;
305 }
306
307 l = spp->totlen;
308 nreg = dust_segs (l, spp, start, reg,
309 (Int4)level, (Int4)window, (Int4)minwin, (Int4)linker);
310 slp = slpDust (spp, slp, id, &vnp,
311 reg, nreg, loopDustMax);
312 /* find tail - this way avoids referencing the pointer */
313 while (reg->next) reg = reg->next;
314
315 SeqPortFree (spp);
316 }
317
318 /* clean up memory */
319 reg = regold;
320 while (reg)
321 {
322 regold = reg;
323 reg = reg->next;
324 MemFree (regold);
325 }
326
327 return slp;
328 }
329
330 /* entry point for dusting - from BioseqDust or SeqLocDust */
331
332 static Int4 dust_segs (Int4 length, SeqPortPtr spp, Int4 start,
333 DREGION PNTR reg,
334 Int4 level, Int4 windowsize, Int4 minwin, Int4 linker)
335 {
336 Int4 len;
337 Int4 i, invrescount;
338 Int4 retlen;
339 UcharPtr seq;
340 DREGION PNTR regold = NULL;
341 DCURLOC cloc;
342 Int4 nreg, windowhalf;
343
344 /* defaults are more-or-less in keeping with original dust */
345 if (level < 2 || level > 64) level = 20;
346 if (windowsize < 8 || windowsize > 64) windowsize = 64;
347 if (minwin < 4 || minwin > 128) minwin = 4;
348 if (linker < 1 || linker > 32) linker = 1;
349 windowhalf = windowsize / 2;
350
351 /* keep track of weird invalid residues */
352 invrescount = 0;
353
354 nreg = 0;
355 seq = MemNew (windowsize); /* triplets */
356 if (!seq)
357 {
358 ErrPostEx (SEV_FATAL, 4, 1,
359 "failed to allocate triplet buffer");
360 ErrShow ();
361 return nreg;
362 }
363
364 for (i = 0; i < length; i += windowhalf)
365 {
366 len = (Int4) ((length > i+windowsize) ? windowsize : length-i);
367 len -= 2;
368 retlen = wo (len, spp, i, &cloc, &invrescount, seq);
369
370 /* get rid of itsy-bitsy's, dusttripfind aborts - move 1 triplet away */
371 if ((cloc.curend - cloc.curstart + 1) < minwin)
372 {
373 if (retlen != len)
374 {
375 i += (retlen - windowhalf + 3);
376 }
377 continue;
378 }
379
380 if (cloc.curlevel > level)
381 {
382 if (nreg &&
383 regold->to + linker >= cloc.curstart+i+start &&
384 regold->from <= cloc.curstart + i + start)
385 {
386 /* overlap windows nicely if needed */
387 regold->to = cloc.curend + i + start;
388 }
389 else
390 {
391 /* new window or dusted regions do not overlap */
392 reg->from = cloc.curstart + i + start;
393 reg->to = cloc.curend + i + start;
394 /* 5' edge effects - 3' edge effects - are best handled interactively */
395 /* it prbly would be good to put 'linker' as an interactive option */
396 /* interactive means wrapping stuff up in a graphics shell */
397 regold = reg;
398 reg = MemNew (sizeof (DREGION));
399 if (!reg)
400 {
401 ErrPostEx (SEV_FATAL, 3, 1,
402 "memory allocation error");
403 ErrShow ();
404 MemFree(seq);
405 return nreg;
406 }
407 reg->next = NULL;
408 regold->next = reg;
409 nreg++;
410 }
411 /* kill virtually all 3' tiling anomalies */
412 if (cloc.curend < windowhalf)
413 {
414 i += (cloc.curend - windowhalf);
415 }
416 } /* end 'if' high score */
417 } /* end for */
418 MemFree (seq);
419 if (invrescount > 0)
420 {
421 ErrPostEx (SEV_INFO, 3, 2,
422 "Total invalid residues found: %ld", (long) invrescount);
423 ErrShow ();
424 }
425 return nreg;
426 }
427
428 static Int4 wo (Int4 len, SeqPortPtr spp, Int4 iseg, DCURLOC PNTR cloc,
429 Int4 PNTR invrescount, UcharPtr seq)
430 {
431 Int4 i, flen;
432
433 cloc->curlevel = 0;
434 cloc->curstart = 0;
435 cloc->curend = 0;
436
437 /* get the chunk of sequence in triplets */
438
439 SeqPortSeek (spp, iseg, SEEK_SET);
440 MemSet (seq,0,len+2); /* Zero the triplet buffer */
441 flen = dusttripfind (spp, seq, iseg, len, invrescount);
442
443 /* dust the chunk */
444 for (i = 0; i < flen; i++)
445 {
446 wo1 (flen-i, seq+i, i, cloc);
447 }
448
449 cloc->curend += cloc->curstart;
450
451 return flen;
452 }
453
454 static void wo1 (Int4 len, UcharPtr seq, Int4 iwo, DCURLOC PNTR cloc)
455 {
456 Uint4 sum;
457 Int4 loop;
458 Int4 newlevel;
459
460 Int2 PNTR countsptr;
461 Int2 counts[4*4*4];
462 MemSet (counts, 0, sizeof (counts));
463 /* zero everything */
464 sum = 0;
465 newlevel = 0;
466
467 /* dust loop -- specific for triplets */
468 for (loop = 0; loop < len; loop++)
469 {
470 countsptr = &counts[*seq++];
471 if (*countsptr)
472 {
473 sum += (Uint8)(*countsptr);
474
475 newlevel = 10 * sum / loop;
476
477 if (cloc->curlevel < newlevel)
478 {
479 cloc->curlevel = newlevel;
480 cloc->curstart = iwo;
481 cloc->curend = loop + 2; /* triplets */
482 }
483 }
484 (*countsptr)++;
485 }
486 return;
487 }
488
489 /* fill an array with 2-bit encoded triplets */
490
491 static Int4 dusttripfind (SeqPortPtr spp, UcharPtr s1, Int4 icur, Int4 max,
492 Int4 PNTR invrescount)
493 {
494 Int4 pos;
495 Int4 n;
496 UcharPtr s2, s3;
497 Int2 c;
498 Boolean flagVD;
499
500 n = 0;
501
502 s2 = s1 + 1;
503 s3 = s1 + 2;
504
505 SeqPortSeek (spp, icur, SEEK_SET);
506
507 /* set up needs streamlining */
508 /* start again at segment or virtual sequence bounderies */
509 /* set up 1 */
510 if ((c = SeqPortGetResidue (spp)) == SEQPORT_EOF)
511 return n;
512 if (c == SEQPORT_EOS || c == SEQPORT_VIRT)
513 return n;
514 if (!IS_residue (c))
515 {
516 c = 0; /* 255 it's 'A' */
517 if (*invrescount < 3)
518 {
519 pos = SeqPortTell (spp);
520 ErrPostEx (SEV_INFO, 5, 1,
521 "Invalid residue converted to 'A': %ld", (long) pos);
522 ErrShow ();
523 }
524 (*invrescount)++;
525 }
526 *s1 |= c;
527 *s1 <<= 2;
528
529 /* set up 2 */
530 if ((c = SeqPortGetResidue (spp)) == SEQPORT_EOF)
531 return n;
532 if (c == SEQPORT_EOS || c == SEQPORT_VIRT)
533 return n;
534 if (!IS_residue (c))
535 {
536 c = 0; /* 255 it's 'A' */
537 if (*invrescount < 3)
538 {
539 pos = SeqPortTell (spp);
540 ErrPostEx (SEV_INFO, 5, 1,
541 "Invalid residue converted to 'A': %ld", (long) pos);
542 ErrShow ();
543 }
544 (*invrescount)++;
545 }
546 *s1 |= c;
547 *s2 |= c;
548
549 /* triplet fill loop */
550 flagVD = TRUE;
551 while ((c = SeqPortGetResidue (spp)) != SEQPORT_EOF && n < max)
552 {
553 if (c == INVALID_RESIDUE)
554 {
555 c = 0; /* 255 it's 'A' */
556 if (*invrescount < 3)
557 {
558 pos = SeqPortTell (spp);
559 ErrPostEx (SEV_INFO, 5, 1,
560 "Invalid residue converted to 'A': %ld", (long) pos);
561 ErrShow ();
562 }
563 (*invrescount)++;
564 }
565 if (IS_residue (c))
566 {
567 *s1 <<= 2;
568 *s2 <<= 2;
569 *s1 |= c;
570 *s2 |= c;
571 *s3 |= c;
572 s1++;
573 s2++;
574 s3++;
575 n++;
576 }
577 else
578 {
579 switch (c)
580 {
581 case SEQPORT_EOS: /* 252 rare */
582 break;
583 /* VIRT if there is an undetermined segment of sequence */
584 case SEQPORT_VIRT: /* 251 ignore ? */
585 default:
586 /* flagVD = TRUE; dust across v-seg */
587 flagVD = FALSE; /* don't dust across */
588 break;
589 }
590 if (!flagVD) break;
591 }
592 } /* end while */
593
594 return n;
595 }
596
597 /* look for dustable locations */
598
599 static SeqLocPtr slpDust (SeqPortPtr spp, SeqLocPtr slp, SeqIdPtr id,
600 ValNodePtr PNTR vnp, DREGION PNTR reg,
601 Int4 nreg, Int4 loopDustMax)
602 {
603 SeqIntPtr sintp;
604 Int4 i;
605 Boolean flagNoPack;
606
607 /* point to dusted locations */
608 if (nreg)
609 {
610
611 /* loopDustMax == 1 forces PACKED_INT IN - PACKED_INT OUT as needed */
612 flagNoPack = FALSE;
613 if (nreg == 1 && loopDustMax == 1) flagNoPack = TRUE;
614
615 if (!slp)
616 {
617 if ((slp = ValNodeNew (NULL)) == NULL)
618 {
619 ErrPostEx (SEV_ERROR, 6, 1,
620 "val node new failed");
621 ErrShow ();
622 return slp;
623 }
624 }
625
626 if (flagNoPack)
627 {
628 slp->choice = SEQLOC_INT;
629 }
630 else
631 {
632 slp->choice = SEQLOC_PACKED_INT;
633 }
634
635 for (i = 0; i < nreg; i++)
636 {
637 sintp = SeqIntNew ();
638 if (!sintp)
639 {
640 ErrPostEx (SEV_FATAL, 6, 2,
641 "memory allocation error");
642 ErrShow ();
643 return slp;
644 }
645 sintp->id = SeqIdDup (id);
646 sintp->from = reg->from;
647 sintp->to = reg->to;
648 if (!flagNoPack) ValNodeAddPointer
649 (vnp, SEQLOC_INT, sintp);
650 reg = reg->next;
651 }
652
653 if (flagNoPack)
654 {
655 slp->data.ptrvalue = (Pointer) sintp;
656 }
657 else
658 {
659 slp->data.ptrvalue = *vnp;
660 }
661 }
662 return slp;
663 }
664
665 extern FloatHiPtr DustGraph (SeqPortPtr spp, Int4 length, Int2 level,
666 Int2 window, Int2 minwin, Int2 linker)
667 {
668 DCURLOC cloc;
669 Int2 windowhalf;
670 Int2 retlen;
671 Int4 len;
672 Int4 i, n;
673 Int4 invrescount;
674 FloatHi maxval, curval;
675 FloatHiPtr fhead, ftemp;
676 FloatHiPtr fptr;
677 UcharPtr seq;
678
679 invrescount = 0;
680 windowhalf = window / 2;
681
682 maxval = level * 5.0;
683 if (maxval > 100.0)
684 maxval = 100.0;
685
686 fptr = (FloatHiPtr) MemNew ((sizeof (FloatHi)) * length);
687 fhead = fptr;
688 for (i = 0; i < length; i++)
689 *fptr++ = 0.0;
690
691 fptr = fhead;
692 seq = MemNew ((size_t)window); /* triplets */
693 if (!seq)
694 {
695 ErrPostEx (SEV_FATAL, 4, 1,
696 "failed to allocate triplet buffer");
697 ErrShow ();
698 return 0;
699 }
700
701 for (i = 0; i < length; i += windowhalf)
702 {
703 len = (length > i+window) ? window : (Int2) (length-i);
704 len -= 2;
705 retlen = wo (len, spp, i, &cloc, &invrescount, seq);
706
707 if ((cloc.curend - cloc.curstart + 1) < minwin)
708 {
709 if (retlen != len)
710 i += (retlen - windowhalf + 3);
711 continue;
712 }
713 curval = (FloatHi) cloc.curlevel;
714 if (curval > maxval)
715 curval = maxval;
716 fptr = fhead + cloc.curstart + i;
717 for (n = cloc.curstart; n < cloc.curend+1; n++)
718 *fptr++ = curval;
719 }
720 MemFree(seq);
721 fptr = fhead;
722 for (i = 0; i < length-((Int4)linker); i++)
723 {
724 if (*fptr != 0)
725 {
726 ftemp = fptr + linker;
727 if (*ftemp != 0)
728 {
729 curval = (*ftemp + *fptr) / 2;
730 fptr++;
731 for (n = 1; n < linker; n++)
732 *fptr++ = curval;
733 i += (linker - 1);
734 }
735 }
736 else
737 {
738 fptr++;
739 }
740 }
741
742 return fhead;
743 }
744 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |