NCBI C Toolkit Cross Reference

C/tools/blast_dust.c


  1 /* $Id: blast_dust.c,v 1.40 2007/01/23 15:25:22 madden Exp $
  2  * ===========================================================================
  3  *
  4  *                            PUBLIC DOMAIN NOTICE
  5  *               National Center for Biotechnology Information
  6  *
  7  *  This software/database is a "United States Government Work" under the
  8  *  terms of the United States Copyright Act.  It was written as part of
  9  *  the author's official duties as a United States Government employee and
 10  *  thus cannot be copyrighted.  This software/database is freely available
 11  *  to the public for use. The National Library of Medicine and the U.S.
 12  *  Government have not placed any restriction on its use or reproduction.
 13  *
 14  *  Although all reasonable efforts have been taken to ensure the accuracy
 15  *  and reliability of the software and data, the NLM and the U.S.
 16  *  Government do not and cannot warrant the performance or results that
 17  *  may be obtained by using this software or data. The NLM and the U.S.
 18  *  Government disclaim all warranties, express or implied, including
 19  *  warranties of performance, merchantability or fitness for any particular
 20  *  purpose.
 21  *
 22  *  Please cite the author in any work or product based on this material.
 23  *
 24  * ==========================================================================
 25  *
 26  * Authors: Richa Agarwala (based upon versions variously worked upon by Roma 
 27  *          Tatusov, John Kuzio, and Ilya Dondoshansky).
 28  *   
 29  * ==========================================================================
 30  */
 31 
 32 /** @file blast_dust.c
 33  * A utility to find low complexity NA regions. This parallels functionality 
 34  * of dust.c from the C toolkit, but without using the structures generated 
 35  * from ASN.1 spec.
 36  */
 37 
 38 #ifndef SKIP_DOXYGEN_PROCESSING
 39 static char const rcsid[] =
 40     "$Id: blast_dust.c,v 1.40 2007/01/23 15:25:22 madden Exp $";
 41 #endif /* SKIP_DOXYGEN_PROCESSING */
 42 
 43 #include <sequtil.h>
 44 #include <objmgr.h>
 45 #include <seqport.h>
 46 #include <blastkar.h>
 47 #include <blast_dust.h>
 48 
 49 
 50 
 51 
 52 /* local, file scope, structures and variables */
 53 
 54 /** localcurrents 
 55  * @todo expand documentation
 56  */
 57 typedef struct DCURLOC { 
 58         Int4    cursum, curstart, curend;
 59         Int2    curlength;
 60 } DCURLOC;
 61 
 62 Uint1 *SUM_THRESHOLD;
 63 
 64 /* local functions */
 65 
 66 static void wo (Int4, Uint1*, Int4, DCURLOC*, Uint1*, Uint1, Int4);
 67 static Boolean wo1 (Int4, Uint1*, Int4, DCURLOC*);
 68 static Int4 dust_triplet_find (Uint1*, Int4, Int4, Uint1*);
 69 static void s_GetSequence(SeqPortPtr spp, Uint1* buf, Int4 buf_length);
 70 static SeqLocPtr s_SlpDust (SeqLocPtr slp, SeqIdPtr id, 
 71                      DREGION *reg, Int4 nreg, Int4 loopDustMax);
 72 
 73 
 74 SeqLocPtr BioseqDustEx (BioseqPtr bsp, Int4 level, Int4 windowsize, Int4 linker)
 75 {
 76         SeqLocPtr       slp = NULL;     /* initialize */
 77 
 78         SeqPortPtr      spp;
 79 
 80         DREGION PNTR reg, PNTR regold;
 81         Int4    nreg;
 82         Int4 loopDustMax = 1;
 83 
 84         Uint1* buf = NULL;
 85 
 86 /* error msg stuff */
 87 /*      ErrSetOptFlags (EO_MSG_CODES | EO_SHOW_FILELINE | EO_BEEP); */
 88         ErrSetOptFlags (EO_MSG_CODES);
 89 
 90 /* make sure bioseq is there */
 91         if (!bsp)
 92         {
 93                 ErrPostEx (SEV_ERROR, 1, 1,
 94                           "no bioseq");
 95                 ErrShow ();
 96                 return slp;
 97         }
 98         if (!ISA_na (bsp->mol))
 99         {
100                 ErrPostEx (SEV_WARNING, 1, 2,
101                           "not nucleic acid");
102                 ErrShow ();
103                 return slp;
104         }
105 
106 /* place for dusted regions */
107         reg = MemNew (sizeof (DREGION));
108         if (!reg)
109         {
110                 ErrPostEx (SEV_FATAL, 1, 3,
111                            "memory allocation error");
112                 ErrShow ();
113                 return slp;
114         }
115         reg->from = 0;
116         reg->to = 0;
117         reg->next = NULL;
118 
119 /* do it */
120         spp = SeqPortNew (bsp, 0, bsp->length, 0, Seq_code_ncbi2na);
121         if (!spp)
122         {
123                 ErrPostEx (SEV_ERROR, 1, 4,
124                            "sequence port open failed");
125                 ErrShow ();
126                 MemFree (reg);
127                 return slp;
128         }
129         buf = MemNew((1+bsp->length)*sizeof(Uint1));
130         s_GetSequence(spp, buf, (1+bsp->length));
131 
132         nreg = DustSegs (buf, bsp->length, 0, reg, level, windowsize, linker);
133         slp = s_SlpDust (NULL, bsp->id, reg, nreg, loopDustMax);
134 
135 /* clean up memory */
136         SeqPortFree (spp);
137         MemFree(buf);
138         while (reg)
139         {
140                 regold = reg;
141                 reg = reg->next;
142                 MemFree (regold);
143         }
144 
145         return slp;
146 }
147 
148 /* sequence location pointers */
149 
150 SeqLocPtr SeqLocDustEx (SeqLocPtr this_slp,
151                       Int4 level, Int4 window, Int4 linker)
152 {
153         SeqLocPtr       next_slp, slp = NULL;
154 
155         SeqIdPtr        id;
156         BioseqPtr       bsp;
157         SeqPortPtr      spp;
158 
159         DREGION PNTR reg, PNTR regold;
160         Int4 nreg;
161         Int4 start, end, length;
162         Int2 loopDustMax = 0;
163 
164 /* error msg stuff */
165         ErrSetOptFlags (EO_MSG_CODES);
166 
167         if (!this_slp)
168         {
169                 ErrPostEx (SEV_ERROR, 2, 1,
170                           "no sequence location given for dusting");
171                 ErrShow ();
172                 return slp;
173         }
174 
175 /* place for dusted regions */
176         regold = reg = MemNew (sizeof (DREGION));
177         if (!reg)
178         {
179                 ErrPostEx (SEV_FATAL, 2, 2,
180                            "memory allocation error");
181                 ErrShow ();
182                 return slp;
183         }
184         reg->from = 0;
185         reg->to = 0;
186         reg->next = NULL;
187 
188 /* count seqlocs */
189         next_slp = NULL;
190         while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
191                         loopDustMax++;
192         if (!loopDustMax)
193         {
194                 ErrPostEx (SEV_ERROR, 2, 3,
195                            "can not find next seq loc");
196                 ErrShow ();
197         }
198 
199 /* loop for dusting as needed */
200         next_slp = NULL;
201         while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
202         {
203                 Uint1* buf = NULL;
204 /* offsets into actual sequence */
205                 start = SeqLocStart (next_slp);
206                 end = SeqLocStop (next_slp);
207 
208 /* if all goes okay should get a seqport pointer */
209                 id = SeqLocId (next_slp);
210                 if (!id)
211                 {
212                         ErrPostEx (SEV_ERROR, 2, 4,
213                                   "no bioseq id");
214                         ErrShow ();
215                         continue;
216                 }
217                 bsp = BioseqLockById (id);
218                 if (!bsp)
219                 {
220                         ErrPostEx (SEV_ERROR, 2, 5,
221                                   "no bioseq");
222                         ErrShow ();
223                         continue;
224                 }
225                 if (!ISA_na (bsp->mol))
226                 {
227                         ErrPostEx (SEV_WARNING, 2, 6,
228                                   "not nucleic acid");
229                         ErrShow ();
230                         BioseqUnlock (bsp);
231                         continue;
232                 }
233                 spp = SeqPortNew (bsp, start, end, 0, Seq_code_ncbi2na);
234                 BioseqUnlock (bsp);
235                 if (!spp)
236                 {
237                         ErrPostEx (SEV_ERROR, 2, 7,
238                                    "sequence port open failed");
239                         ErrShow ();
240                         continue;
241                 }
242                 length = spp->totlen;
243                 buf = MemNew((1+length)*sizeof(Uint1));
244                 s_GetSequence(spp, buf, (1+length));
245 
246                 nreg = DustSegs (buf, length, start, reg,
247                                   level, window, linker);
248                 buf = MemFree(buf);
249                 SeqPortFree (spp);
250                 slp = s_SlpDust (slp, id, reg, nreg, loopDustMax);
251 /* find tail - this way avoids referencing the pointer */
252                 while (reg->next) reg = reg->next;
253 
254         }
255 
256 /* clean up memory */
257         reg = regold;
258         while (reg)
259         {
260                 regold = reg;
261                 reg = reg->next;
262                 MemFree (regold);
263         }
264 
265         return slp;
266 }
267 
268 
269 static void
270 s_GetSequence(SeqPortPtr spp, Uint1* buf, Int4 buf_length)
271 {
272         Uint1 residue;
273         int index=0;
274 
275         ASSERT(spp && buf && buf_length > 0);
276 
277         while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
278         {
279                 if (residue == SEQPORT_VIRT)
280                         continue;
281                 buf[index] = residue;
282                 index++;
283         }
284         ASSERT(index < buf_length);
285         buf[index] = NULLB;
286         return;
287 }
288 
289 
290 /* Produce locations from dust results. */
291 
292 static SeqLocPtr s_SlpDust (SeqLocPtr slp, SeqIdPtr id,
293                           DREGION *reg, Int4 nreg, Int4 loopDustMax)
294 {
295         SeqIntPtr       sintp;
296         Int4            i;
297         Boolean         flagNoPack;
298         ValNodePtr      vnp = NULL;
299 
300 /* point to dusted locations */
301         if (nreg)
302         {
303 
304 /* loopDustMax == 1 forces PACKED_INT IN - PACKED_INT OUT as needed     */
305                 flagNoPack = FALSE;
306                 if (nreg == 1 && loopDustMax == 1) flagNoPack = TRUE;
307 
308                 if (!slp)
309                 {
310                         if ((slp = ValNodeNew (NULL)) == NULL)
311                         {
312                                 ErrPostEx (SEV_ERROR, 6, 1,
313                                            "val node new failed");
314                                 ErrShow ();
315                                 return slp;
316                         }
317                 }
318 
319                 if (flagNoPack)
320                 {
321                         slp->choice = SEQLOC_INT;
322                 }
323                 else
324                 {
325                         slp->choice = SEQLOC_PACKED_INT;
326                 }
327 
328                 for (i = 0; i < nreg; i++)
329                 {
330                         sintp = SeqIntNew ();
331                         if (!sintp)
332                         {
333                                 ErrPostEx (SEV_FATAL, 6, 2,
334                                            "memory allocation error");
335                                 ErrShow ();
336                                 return slp;
337                         }
338                         sintp->id = SeqIdDup (id);
339                         sintp->from = reg->from;
340                         sintp->to = reg->to;
341                         if (!flagNoPack) ValNodeAddPointer
342                                         (&vnp, SEQLOC_INT, sintp);
343                         reg = reg->next;
344                 }
345 
346                 if (flagNoPack)
347                 {
348                         slp->data.ptrvalue = (Pointer) sintp;
349                 }
350                 else
351                 {
352                         slp->data.ptrvalue = vnp;
353                 }
354         }
355         return slp;
356 }
357 
358 /* entry point for dusting */
359 
360 Int4 DustSegs (Uint1* sequence, Int4 length, Int4 start,
361                        DREGION* reg,
362                        Int4 level, Int4 windowsize, Int4 linker)
363 {
364    Int4    len;
365    Int4 i;
366    Uint1* seq;
367    DREGION* regold = NULL;
368    DCURLOC      cloc;
369    Int4 nreg;
370    /* Default values. */
371    const int kDustLevel = 20;
372    const int kDustWindow = 64;
373    const int kDustLinker = 1;
374    
375    /* defaults are more-or-less in keeping with original dust */
376    if (level < 2 || level > 64) level = kDustLevel;
377    if (windowsize < 8 || windowsize > 64) windowsize = kDustWindow;
378    if (linker < 1 || linker > 32) linker = kDustLinker;
379    
380    nreg = 0;
381    seq = (Uint1*) calloc(1, windowsize);                        /* triplets */
382    if (!seq) {
383       return -1;
384    }
385    SUM_THRESHOLD = (Uint1 *) calloc(windowsize, sizeof(Uint1));  
386    if (!SUM_THRESHOLD) {
387       return -1;
388    }
389    SUM_THRESHOLD[0] = 1;
390    for (i=1; i < windowsize; i++)
391        SUM_THRESHOLD[i] = (level * i)/10;
392 
393    if (length < windowsize) windowsize = length;
394 
395    /* Consider smaller windows in beginning of the sequence */
396    for (i = 2; i <= windowsize-1; i++) {
397       len = i-1;
398       wo (len, sequence, 0, &cloc, seq, 1, level);
399       
400       if (cloc.cursum*10 > level*cloc.curlength) {
401          if (nreg &&
402              regold->to + linker >= cloc.curstart+start &&
403              regold->from <= cloc.curend + start + linker) {
404             /* overlap windows nicely if needed */
405             if (regold->to < cloc.curend +  start)
406                 regold->to = cloc.curend +  start;
407             if (regold->from > cloc.curstart + start)
408                 regold->from = cloc.curstart + start;
409          } else {
410             /* new window or dusted regions do not overlap */
411             reg->from = cloc.curstart + start;
412             reg->to = cloc.curend + start;
413             regold = reg;
414             reg = (DREGION*) calloc(1, sizeof(DREGION));
415             if (!reg) {
416                MemFree(seq);
417                MemFree(SUM_THRESHOLD);
418                return -1;
419             }
420             reg->next = NULL;
421             regold->next = reg;
422             nreg++;
423          }
424       }                         /* end 'if' high score  */
425    }                                    /* end for */
426 
427    for (i = 1; i < length-2; i++) {
428       len = (Int4) ((length > i+windowsize) ? windowsize : length-i);
429       len -= 2;
430       if (length >= i+windowsize)
431           wo (len, sequence, i, &cloc, seq, 2, level);
432       else /* remaining portion of sequence is less than windowsize */
433           wo (len, sequence, i, &cloc, seq, 3, level);
434       
435       if (cloc.cursum*10 > level*cloc.curlength) {
436          if (nreg &&
437              regold->to + linker >= cloc.curstart+i+start &&
438              regold->from <= cloc.curend + i + start + linker) {
439             /* overlap windows nicely if needed */
440             if (regold->to < cloc.curend + i + start)
441                 regold->to = cloc.curend + i + start;
442             if (regold->from > cloc.curstart + i + start)
443                 regold->from = cloc.curstart + i + start;
444          } else {
445             /* new window or dusted regions do not overlap */
446             reg->from = cloc.curstart + i + start;
447             reg->to = cloc.curend + i + start;
448             regold = reg;
449             reg = (DREGION*) calloc(1, sizeof(DREGION));
450             if (!reg) {
451                MemFree(seq);
452                MemFree(SUM_THRESHOLD);
453                return -1;
454             }
455             reg->next = NULL;
456             regold->next = reg;
457             nreg++;
458          }
459       }                         /* end 'if' high score  */
460    }                                    /* end for */
461    MemFree (seq);
462    MemFree(SUM_THRESHOLD);
463    return nreg;
464 }
465 
466 static void wo (Int4 len, Uint1* seq_start, Int4 iseg, DCURLOC* cloc, 
467                 Uint1* seq, Uint1 FIND_TRIPLET, Int4 level)
468 {
469         Int4 smaller_window_start, mask_window_end;
470         Boolean SINGLE_TRIPLET;
471 
472         cloc->cursum = 0;
473         cloc->curlength = 1;
474         cloc->curstart = 0;
475         cloc->curend = 0;
476 
477         if (len < 1)
478                 return;
479 
480         /* get the chunk of sequence in triplets */
481         if (FIND_TRIPLET==1) /* Append one */
482         {
483                 seq[len-1] = seq[len] = seq[len+1] = 0;
484                 dust_triplet_find (seq_start, iseg+len-1, 1, seq+len-1);
485         }
486         if (FIND_TRIPLET==2) /* Copy suffix as prefix and find one */
487         {
488                 memmove(seq,seq+1,(len-1)*sizeof(Uint1));
489                 seq[len-1] = seq[len] = seq[len+1] = 0;
490                 dust_triplet_find (seq_start, iseg+len-1, 1, seq+len-1);
491         }
492         if (FIND_TRIPLET==3) /* Copy suffix */
493                 memmove(seq,seq+1,len*sizeof(Uint1));
494 
495         /* dust the chunk */
496         SINGLE_TRIPLET = wo1 (len, seq, 0, cloc); /* dust at start of window */
497 
498         /* consider smaller windows only if anything interesting 
499            found for starting position  and smaller windows have potential of
500            being at higher level */
501         if ((cloc->cursum*10 > level*cloc->curlength) && (!SINGLE_TRIPLET)) {
502                 mask_window_end = cloc->curend-1;
503                 smaller_window_start = 1;
504                 while ((smaller_window_start < mask_window_end) &&
505                        (!SINGLE_TRIPLET)) {
506                         SINGLE_TRIPLET = wo1(mask_window_end-smaller_window_start,
507                              seq+smaller_window_start, smaller_window_start, cloc);
508                         smaller_window_start++;
509                 }
510         }
511 
512         cloc->curend += cloc->curstart;
513 }
514 
515 /** returns TRUE if there is single triplet in the sequence considered */
516 static Boolean wo1 (Int4 len, Uint1* seq, Int4 iwo, DCURLOC* cloc)
517 {
518    Uint4 sum;
519         Int4 loop;
520 
521         Int2* countsptr;
522         Int2 counts[4*4*4];
523         Uint1 triplet_count = 0;
524 
525         memset (counts, 0, sizeof (counts));
526 /* zero everything */
527         sum = 0;
528 
529 /* dust loop -- specific for triplets   */
530         for (loop = 0; loop < len; loop++)
531         {
532                 countsptr = &counts[*seq++];
533                 if (*countsptr)
534                 {
535                         sum += (Uint4)(*countsptr);
536 
537                     if (sum >= SUM_THRESHOLD[loop])
538                     {
539                         if ((Uint4)cloc->cursum*loop < sum*cloc->curlength)
540                         {
541                                 cloc->cursum = sum;
542                                 cloc->curlength = loop;
543                                 cloc->curstart = iwo;
544                                 cloc->curend = loop + 2; /* triplets */
545                         }
546                     }
547                 }
548                 else
549                         triplet_count++;
550                 (*countsptr)++;
551         }
552 
553         if (triplet_count > 1)
554                 return(FALSE);
555         return(TRUE);
556 }
557 
558 /** Fill an array with 2-bit encoded triplets.
559  * @param seq_start Pointer to the start of the sequence in blastna 
560  *                  encoding [in]
561  * @param icur Offset at which to start extracting triplets [in]
562  * @param max Maximal length of the sequence segment to be processed [in]
563  * @param s1 Array of triplets [out]
564  * @return How far was the sequence processed?
565  */
566 static Int4 
567 dust_triplet_find (Uint1* seq_start, Int4 icur, Int4 max, Uint1* s1)
568 {
569    Int4 n;
570    Uint1* s2,* s3;
571    Int2 c;
572    Uint1* seq = &seq_start[icur];
573    Uint1 end_byte = ncbi4na_to_blastna[NULLB];
574    const int k_NCBI2na_mask =  0x03;
575    
576    n = 0;
577    
578    s2 = s1 + 1;
579    s3 = s1 + 2;
580    
581    /* set up 1 */
582    if ((c = *seq++) == end_byte)
583       return n;
584    c &= k_NCBI2na_mask;
585    *s1 |= c;
586    *s1 <<= 2;
587    
588    /* set up 2 */
589    if ((c = *seq++) == end_byte)
590       return n;
591    c &= k_NCBI2na_mask;
592    *s1 |= c;
593    *s2 |= c;
594    
595    /* triplet fill loop */
596    while (n < max && (c = *seq++) != end_byte) {
597       c &= k_NCBI2na_mask;
598       *s1 <<= 2;
599       *s2 <<= 2;
600       *s1 |= c;
601       *s2 |= c;
602       *s3 |= c;
603       s1++;
604       s2++;
605       s3++;
606       n++;
607    }
608    
609    return n;
610 }
611 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.