|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/tools/blast_dust.c |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: blast_dust.c,v 1.40 2007/01/23 15:25:22 madden Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ==========================================================================
25 *
26 * Authors: Richa Agarwala (based upon versions variously worked upon by Roma
27 * Tatusov, John Kuzio, and Ilya Dondoshansky).
28 *
29 * ==========================================================================
30 */
31
32 /** @file blast_dust.c
33 * A utility to find low complexity NA regions. This parallels functionality
34 * of dust.c from the C toolkit, but without using the structures generated
35 * from ASN.1 spec.
36 */
37
38 #ifndef SKIP_DOXYGEN_PROCESSING
39 static char const rcsid[] =
40 "$Id: blast_dust.c,v 1.40 2007/01/23 15:25:22 madden Exp $";
41 #endif /* SKIP_DOXYGEN_PROCESSING */
42
43 #include <sequtil.h>
44 #include <objmgr.h>
45 #include <seqport.h>
46 #include <blastkar.h>
47 #include <blast_dust.h>
48
49
50
51
52 /* local, file scope, structures and variables */
53
54 /** localcurrents
55 * @todo expand documentation
56 */
57 typedef struct DCURLOC {
58 Int4 cursum, curstart, curend;
59 Int2 curlength;
60 } DCURLOC;
61
62 Uint1 *SUM_THRESHOLD;
63
64 /* local functions */
65
66 static void wo (Int4, Uint1*, Int4, DCURLOC*, Uint1*, Uint1, Int4);
67 static Boolean wo1 (Int4, Uint1*, Int4, DCURLOC*);
68 static Int4 dust_triplet_find (Uint1*, Int4, Int4, Uint1*);
69 static void s_GetSequence(SeqPortPtr spp, Uint1* buf, Int4 buf_length);
70 static SeqLocPtr s_SlpDust (SeqLocPtr slp, SeqIdPtr id,
71 DREGION *reg, Int4 nreg, Int4 loopDustMax);
72
73
74 SeqLocPtr BioseqDustEx (BioseqPtr bsp, Int4 level, Int4 windowsize, Int4 linker)
75 {
76 SeqLocPtr slp = NULL; /* initialize */
77
78 SeqPortPtr spp;
79
80 DREGION PNTR reg, PNTR regold;
81 Int4 nreg;
82 Int4 loopDustMax = 1;
83
84 Uint1* buf = NULL;
85
86 /* error msg stuff */
87 /* ErrSetOptFlags (EO_MSG_CODES | EO_SHOW_FILELINE | EO_BEEP); */
88 ErrSetOptFlags (EO_MSG_CODES);
89
90 /* make sure bioseq is there */
91 if (!bsp)
92 {
93 ErrPostEx (SEV_ERROR, 1, 1,
94 "no bioseq");
95 ErrShow ();
96 return slp;
97 }
98 if (!ISA_na (bsp->mol))
99 {
100 ErrPostEx (SEV_WARNING, 1, 2,
101 "not nucleic acid");
102 ErrShow ();
103 return slp;
104 }
105
106 /* place for dusted regions */
107 reg = MemNew (sizeof (DREGION));
108 if (!reg)
109 {
110 ErrPostEx (SEV_FATAL, 1, 3,
111 "memory allocation error");
112 ErrShow ();
113 return slp;
114 }
115 reg->from = 0;
116 reg->to = 0;
117 reg->next = NULL;
118
119 /* do it */
120 spp = SeqPortNew (bsp, 0, bsp->length, 0, Seq_code_ncbi2na);
121 if (!spp)
122 {
123 ErrPostEx (SEV_ERROR, 1, 4,
124 "sequence port open failed");
125 ErrShow ();
126 MemFree (reg);
127 return slp;
128 }
129 buf = MemNew((1+bsp->length)*sizeof(Uint1));
130 s_GetSequence(spp, buf, (1+bsp->length));
131
132 nreg = DustSegs (buf, bsp->length, 0, reg, level, windowsize, linker);
133 slp = s_SlpDust (NULL, bsp->id, reg, nreg, loopDustMax);
134
135 /* clean up memory */
136 SeqPortFree (spp);
137 MemFree(buf);
138 while (reg)
139 {
140 regold = reg;
141 reg = reg->next;
142 MemFree (regold);
143 }
144
145 return slp;
146 }
147
148 /* sequence location pointers */
149
150 SeqLocPtr SeqLocDustEx (SeqLocPtr this_slp,
151 Int4 level, Int4 window, Int4 linker)
152 {
153 SeqLocPtr next_slp, slp = NULL;
154
155 SeqIdPtr id;
156 BioseqPtr bsp;
157 SeqPortPtr spp;
158
159 DREGION PNTR reg, PNTR regold;
160 Int4 nreg;
161 Int4 start, end, length;
162 Int2 loopDustMax = 0;
163
164 /* error msg stuff */
165 ErrSetOptFlags (EO_MSG_CODES);
166
167 if (!this_slp)
168 {
169 ErrPostEx (SEV_ERROR, 2, 1,
170 "no sequence location given for dusting");
171 ErrShow ();
172 return slp;
173 }
174
175 /* place for dusted regions */
176 regold = reg = MemNew (sizeof (DREGION));
177 if (!reg)
178 {
179 ErrPostEx (SEV_FATAL, 2, 2,
180 "memory allocation error");
181 ErrShow ();
182 return slp;
183 }
184 reg->from = 0;
185 reg->to = 0;
186 reg->next = NULL;
187
188 /* count seqlocs */
189 next_slp = NULL;
190 while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
191 loopDustMax++;
192 if (!loopDustMax)
193 {
194 ErrPostEx (SEV_ERROR, 2, 3,
195 "can not find next seq loc");
196 ErrShow ();
197 }
198
199 /* loop for dusting as needed */
200 next_slp = NULL;
201 while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
202 {
203 Uint1* buf = NULL;
204 /* offsets into actual sequence */
205 start = SeqLocStart (next_slp);
206 end = SeqLocStop (next_slp);
207
208 /* if all goes okay should get a seqport pointer */
209 id = SeqLocId (next_slp);
210 if (!id)
211 {
212 ErrPostEx (SEV_ERROR, 2, 4,
213 "no bioseq id");
214 ErrShow ();
215 continue;
216 }
217 bsp = BioseqLockById (id);
218 if (!bsp)
219 {
220 ErrPostEx (SEV_ERROR, 2, 5,
221 "no bioseq");
222 ErrShow ();
223 continue;
224 }
225 if (!ISA_na (bsp->mol))
226 {
227 ErrPostEx (SEV_WARNING, 2, 6,
228 "not nucleic acid");
229 ErrShow ();
230 BioseqUnlock (bsp);
231 continue;
232 }
233 spp = SeqPortNew (bsp, start, end, 0, Seq_code_ncbi2na);
234 BioseqUnlock (bsp);
235 if (!spp)
236 {
237 ErrPostEx (SEV_ERROR, 2, 7,
238 "sequence port open failed");
239 ErrShow ();
240 continue;
241 }
242 length = spp->totlen;
243 buf = MemNew((1+length)*sizeof(Uint1));
244 s_GetSequence(spp, buf, (1+length));
245
246 nreg = DustSegs (buf, length, start, reg,
247 level, window, linker);
248 buf = MemFree(buf);
249 SeqPortFree (spp);
250 slp = s_SlpDust (slp, id, reg, nreg, loopDustMax);
251 /* find tail - this way avoids referencing the pointer */
252 while (reg->next) reg = reg->next;
253
254 }
255
256 /* clean up memory */
257 reg = regold;
258 while (reg)
259 {
260 regold = reg;
261 reg = reg->next;
262 MemFree (regold);
263 }
264
265 return slp;
266 }
267
268
269 static void
270 s_GetSequence(SeqPortPtr spp, Uint1* buf, Int4 buf_length)
271 {
272 Uint1 residue;
273 int index=0;
274
275 ASSERT(spp && buf && buf_length > 0);
276
277 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
278 {
279 if (residue == SEQPORT_VIRT)
280 continue;
281 buf[index] = residue;
282 index++;
283 }
284 ASSERT(index < buf_length);
285 buf[index] = NULLB;
286 return;
287 }
288
289
290 /* Produce locations from dust results. */
291
292 static SeqLocPtr s_SlpDust (SeqLocPtr slp, SeqIdPtr id,
293 DREGION *reg, Int4 nreg, Int4 loopDustMax)
294 {
295 SeqIntPtr sintp;
296 Int4 i;
297 Boolean flagNoPack;
298 ValNodePtr vnp = NULL;
299
300 /* point to dusted locations */
301 if (nreg)
302 {
303
304 /* loopDustMax == 1 forces PACKED_INT IN - PACKED_INT OUT as needed */
305 flagNoPack = FALSE;
306 if (nreg == 1 && loopDustMax == 1) flagNoPack = TRUE;
307
308 if (!slp)
309 {
310 if ((slp = ValNodeNew (NULL)) == NULL)
311 {
312 ErrPostEx (SEV_ERROR, 6, 1,
313 "val node new failed");
314 ErrShow ();
315 return slp;
316 }
317 }
318
319 if (flagNoPack)
320 {
321 slp->choice = SEQLOC_INT;
322 }
323 else
324 {
325 slp->choice = SEQLOC_PACKED_INT;
326 }
327
328 for (i = 0; i < nreg; i++)
329 {
330 sintp = SeqIntNew ();
331 if (!sintp)
332 {
333 ErrPostEx (SEV_FATAL, 6, 2,
334 "memory allocation error");
335 ErrShow ();
336 return slp;
337 }
338 sintp->id = SeqIdDup (id);
339 sintp->from = reg->from;
340 sintp->to = reg->to;
341 if (!flagNoPack) ValNodeAddPointer
342 (&vnp, SEQLOC_INT, sintp);
343 reg = reg->next;
344 }
345
346 if (flagNoPack)
347 {
348 slp->data.ptrvalue = (Pointer) sintp;
349 }
350 else
351 {
352 slp->data.ptrvalue = vnp;
353 }
354 }
355 return slp;
356 }
357
358 /* entry point for dusting */
359
360 Int4 DustSegs (Uint1* sequence, Int4 length, Int4 start,
361 DREGION* reg,
362 Int4 level, Int4 windowsize, Int4 linker)
363 {
364 Int4 len;
365 Int4 i;
366 Uint1* seq;
367 DREGION* regold = NULL;
368 DCURLOC cloc;
369 Int4 nreg;
370 /* Default values. */
371 const int kDustLevel = 20;
372 const int kDustWindow = 64;
373 const int kDustLinker = 1;
374
375 /* defaults are more-or-less in keeping with original dust */
376 if (level < 2 || level > 64) level = kDustLevel;
377 if (windowsize < 8 || windowsize > 64) windowsize = kDustWindow;
378 if (linker < 1 || linker > 32) linker = kDustLinker;
379
380 nreg = 0;
381 seq = (Uint1*) calloc(1, windowsize); /* triplets */
382 if (!seq) {
383 return -1;
384 }
385 SUM_THRESHOLD = (Uint1 *) calloc(windowsize, sizeof(Uint1));
386 if (!SUM_THRESHOLD) {
387 return -1;
388 }
389 SUM_THRESHOLD[0] = 1;
390 for (i=1; i < windowsize; i++)
391 SUM_THRESHOLD[i] = (level * i)/10;
392
393 if (length < windowsize) windowsize = length;
394
395 /* Consider smaller windows in beginning of the sequence */
396 for (i = 2; i <= windowsize-1; i++) {
397 len = i-1;
398 wo (len, sequence, 0, &cloc, seq, 1, level);
399
400 if (cloc.cursum*10 > level*cloc.curlength) {
401 if (nreg &&
402 regold->to + linker >= cloc.curstart+start &&
403 regold->from <= cloc.curend + start + linker) {
404 /* overlap windows nicely if needed */
405 if (regold->to < cloc.curend + start)
406 regold->to = cloc.curend + start;
407 if (regold->from > cloc.curstart + start)
408 regold->from = cloc.curstart + start;
409 } else {
410 /* new window or dusted regions do not overlap */
411 reg->from = cloc.curstart + start;
412 reg->to = cloc.curend + start;
413 regold = reg;
414 reg = (DREGION*) calloc(1, sizeof(DREGION));
415 if (!reg) {
416 MemFree(seq);
417 MemFree(SUM_THRESHOLD);
418 return -1;
419 }
420 reg->next = NULL;
421 regold->next = reg;
422 nreg++;
423 }
424 } /* end 'if' high score */
425 } /* end for */
426
427 for (i = 1; i < length-2; i++) {
428 len = (Int4) ((length > i+windowsize) ? windowsize : length-i);
429 len -= 2;
430 if (length >= i+windowsize)
431 wo (len, sequence, i, &cloc, seq, 2, level);
432 else /* remaining portion of sequence is less than windowsize */
433 wo (len, sequence, i, &cloc, seq, 3, level);
434
435 if (cloc.cursum*10 > level*cloc.curlength) {
436 if (nreg &&
437 regold->to + linker >= cloc.curstart+i+start &&
438 regold->from <= cloc.curend + i + start + linker) {
439 /* overlap windows nicely if needed */
440 if (regold->to < cloc.curend + i + start)
441 regold->to = cloc.curend + i + start;
442 if (regold->from > cloc.curstart + i + start)
443 regold->from = cloc.curstart + i + start;
444 } else {
445 /* new window or dusted regions do not overlap */
446 reg->from = cloc.curstart + i + start;
447 reg->to = cloc.curend + i + start;
448 regold = reg;
449 reg = (DREGION*) calloc(1, sizeof(DREGION));
450 if (!reg) {
451 MemFree(seq);
452 MemFree(SUM_THRESHOLD);
453 return -1;
454 }
455 reg->next = NULL;
456 regold->next = reg;
457 nreg++;
458 }
459 } /* end 'if' high score */
460 } /* end for */
461 MemFree (seq);
462 MemFree(SUM_THRESHOLD);
463 return nreg;
464 }
465
466 static void wo (Int4 len, Uint1* seq_start, Int4 iseg, DCURLOC* cloc,
467 Uint1* seq, Uint1 FIND_TRIPLET, Int4 level)
468 {
469 Int4 smaller_window_start, mask_window_end;
470 Boolean SINGLE_TRIPLET;
471
472 cloc->cursum = 0;
473 cloc->curlength = 1;
474 cloc->curstart = 0;
475 cloc->curend = 0;
476
477 if (len < 1)
478 return;
479
480 /* get the chunk of sequence in triplets */
481 if (FIND_TRIPLET==1) /* Append one */
482 {
483 seq[len-1] = seq[len] = seq[len+1] = 0;
484 dust_triplet_find (seq_start, iseg+len-1, 1, seq+len-1);
485 }
486 if (FIND_TRIPLET==2) /* Copy suffix as prefix and find one */
487 {
488 memmove(seq,seq+1,(len-1)*sizeof(Uint1));
489 seq[len-1] = seq[len] = seq[len+1] = 0;
490 dust_triplet_find (seq_start, iseg+len-1, 1, seq+len-1);
491 }
492 if (FIND_TRIPLET==3) /* Copy suffix */
493 memmove(seq,seq+1,len*sizeof(Uint1));
494
495 /* dust the chunk */
496 SINGLE_TRIPLET = wo1 (len, seq, 0, cloc); /* dust at start of window */
497
498 /* consider smaller windows only if anything interesting
499 found for starting position and smaller windows have potential of
500 being at higher level */
501 if ((cloc->cursum*10 > level*cloc->curlength) && (!SINGLE_TRIPLET)) {
502 mask_window_end = cloc->curend-1;
503 smaller_window_start = 1;
504 while ((smaller_window_start < mask_window_end) &&
505 (!SINGLE_TRIPLET)) {
506 SINGLE_TRIPLET = wo1(mask_window_end-smaller_window_start,
507 seq+smaller_window_start, smaller_window_start, cloc);
508 smaller_window_start++;
509 }
510 }
511
512 cloc->curend += cloc->curstart;
513 }
514
515 /** returns TRUE if there is single triplet in the sequence considered */
516 static Boolean wo1 (Int4 len, Uint1* seq, Int4 iwo, DCURLOC* cloc)
517 {
518 Uint4 sum;
519 Int4 loop;
520
521 Int2* countsptr;
522 Int2 counts[4*4*4];
523 Uint1 triplet_count = 0;
524
525 memset (counts, 0, sizeof (counts));
526 /* zero everything */
527 sum = 0;
528
529 /* dust loop -- specific for triplets */
530 for (loop = 0; loop < len; loop++)
531 {
532 countsptr = &counts[*seq++];
533 if (*countsptr)
534 {
535 sum += (Uint4)(*countsptr);
536
537 if (sum >= SUM_THRESHOLD[loop])
538 {
539 if ((Uint4)cloc->cursum*loop < sum*cloc->curlength)
540 {
541 cloc->cursum = sum;
542 cloc->curlength = loop;
543 cloc->curstart = iwo;
544 cloc->curend = loop + 2; /* triplets */
545 }
546 }
547 }
548 else
549 triplet_count++;
550 (*countsptr)++;
551 }
552
553 if (triplet_count > 1)
554 return(FALSE);
555 return(TRUE);
556 }
557
558 /** Fill an array with 2-bit encoded triplets.
559 * @param seq_start Pointer to the start of the sequence in blastna
560 * encoding [in]
561 * @param icur Offset at which to start extracting triplets [in]
562 * @param max Maximal length of the sequence segment to be processed [in]
563 * @param s1 Array of triplets [out]
564 * @return How far was the sequence processed?
565 */
566 static Int4
567 dust_triplet_find (Uint1* seq_start, Int4 icur, Int4 max, Uint1* s1)
568 {
569 Int4 n;
570 Uint1* s2,* s3;
571 Int2 c;
572 Uint1* seq = &seq_start[icur];
573 Uint1 end_byte = ncbi4na_to_blastna[NULLB];
574 const int k_NCBI2na_mask = 0x03;
575
576 n = 0;
577
578 s2 = s1 + 1;
579 s3 = s1 + 2;
580
581 /* set up 1 */
582 if ((c = *seq++) == end_byte)
583 return n;
584 c &= k_NCBI2na_mask;
585 *s1 |= c;
586 *s1 <<= 2;
587
588 /* set up 2 */
589 if ((c = *seq++) == end_byte)
590 return n;
591 c &= k_NCBI2na_mask;
592 *s1 |= c;
593 *s2 |= c;
594
595 /* triplet fill loop */
596 while (n < max && (c = *seq++) != end_byte) {
597 c &= k_NCBI2na_mask;
598 *s1 <<= 2;
599 *s2 <<= 2;
600 *s1 |= c;
601 *s2 |= c;
602 *s3 |= c;
603 s1++;
604 s2++;
605 s3++;
606 n++;
607 }
608
609 return n;
610 }
611 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |