NCBI C Toolkit Cross Reference

C/algo/blast/api/repeats_filter.c


  1 #ifndef SKIP_DOXYGEN_PROCESSING
  2 static char const rcsid[] = "$Id: repeats_filter.c,v 1.19 2007/12/19 21:56:48 camacho Exp $";
  3 #endif /* SKIP_DOXYGEN_PROCESSING */
  4 
  5 /*
  6  * ===========================================================================
  7  *
  8  *                            PUBLIC DOMAIN NOTICE
  9  *               National Center for Biotechnology Information
 10  *
 11  *  This software/database is a "United States Government Work" under the
 12  *  terms of the United States Copyright Act.  It was written as part of
 13  *  the author's official duties as a United States Government employee and
 14  *  thus cannot be copyrighted.  This software/database is freely available
 15  *  to the public for use. The National Library of Medicine and the U.S.
 16  *  Government have not placed any restriction on its use or reproduction.
 17  *
 18  *  Although all reasonable efforts have been taken to ensure the accuracy
 19  *  and reliability of the software and data, the NLM and the U.S.
 20  *  Government do not and cannot warrant the performance or results that
 21  *  may be obtained by using this software or data. The NLM and the U.S.
 22  *  Government disclaim all warranties, express or implied, including
 23  *  warranties of performance, merchantability or fitness for any particular
 24  *  purpose.
 25  *
 26  *  Please cite the author in any work or product based on this material.
 27  *
 28  * ===========================================================================
 29  *
 30  * File Name:  $RCSfile: repeats_filter.c,v $
 31  *
 32  * Author: Ilya Dondoshansky
 33  *
 34  */
 35 
 36 /** @file repeats_filter.c
 37  * Repeats filtering API for the new BLAST code
 38  */
 39 
 40 /* Prototypes of functions defined below */
 41 #include <algo/blast/api/repeats_filter.h>
 42 #include <algo/blast/api/blast_api.h>
 43 #include <algo/blast/core/blast_filter.h>
 44 #include <algo/blast/core/blast_util.h>
 45 #include <algo/blast/api/blast_seq.h>
 46 #include <algo/blast/api/seqsrc_readdb.h>
 47 
 48 /** @addtogroup CToolkitAlgoBlast
 49  *
 50  * @{
 51  */
 52 
 53 /** Sets options for a repeats search.
 54  * @param options Options wrapper structure to modify [in] [out]
 55  */
 56 static Int2 
 57 s_SetRepeatsSearchOptions(SBlastOptions* options)
 58 {
 59     Int2 status = 0;
 60 
 61     if (!options || !options->score_options || !options->word_options ||
 62         !options->ext_options)
 63         return -1;
 64 
 65     if ((status = SBlastOptionsSetEvalue(options, REPEATS_SEARCH_EVALUE)))
 66         return status;
 67     if ((status = SBlastOptionsSetWordSize(options, REPEATS_SEARCH_WORD_SIZE)))
 68         return status;
 69     if ((status = 
 70          SBlastOptionsSetFilterString(options, REPEATS_SEARCH_FILTER_STRING)))
 71         return status;
 72 
 73     options->lookup_options->lut_type = eNaLookupTable;
 74     options->score_options->penalty = REPEATS_SEARCH_PENALTY;
 75     options->score_options->gap_open = REPEATS_SEARCH_GAP_OPEN;
 76     options->score_options->gap_extend = REPEATS_SEARCH_GAP_EXTEND;
 77     options->word_options->x_dropoff = REPEATS_SEARCH_XDROP_UNGAPPED;
 78     options->ext_options->gap_x_dropoff_final = REPEATS_SEARCH_XDROP_FINAL;
 79 
 80     return status;
 81 }
 82 
 83 /** Create a SeqLoc with repeat masking locations, given the results of a 
 84  * BLAST search against a database of repeats.
 85  * @param query_seqloc Query sequence locations [in]
 86  * @param results Internal results structure, returned from a BLAST search
 87  *                against a repeats database [in]
 88  * @param mask_seqloc List of ValNode's, one per query, containing mask 
 89  *                    locations. [out] 
 90  */
 91 static Int2
 92 s_FillMaskLocFromBlastHSPResults(SeqLoc* query_seqloc, BlastHSPResults* results,
 93                                  SeqLoc* *mask_seqloc)
 94 {
 95     Int4 num_seqs = 0;
 96     Int4 query_index;
 97     SeqLoc* slp;
 98     BlastMaskLoc* mask;
 99     const EBlastProgramType kProgram = eBlastTypeBlastn;
100     const Uint4 kNumContexts = BLAST_GetNumberOfContexts(eBlastTypeBlastn);
101 
102     if (!query_seqloc || !mask_seqloc)
103         return -1;
104 
105     *mask_seqloc = NULL;
106 
107     if (!results) {
108         return 0;
109     }
110 
111     num_seqs = ValNodeLen(query_seqloc);
112     mask = BlastMaskLocNew(num_seqs*kNumContexts);
113 
114     for (query_index = 0, slp = query_seqloc; slp;
115          ++query_index, slp = slp->next) {
116         Int4 query_length, query_start;
117         Int4 hit_index;
118         BlastSeqLoc* loc_list = NULL, *ordered_loc_list = NULL;
119         BlastHitList* hit_list = results->hitlist_array[query_index];
120        
121         if (!hit_list) {
122             continue;
123         }
124         query_length = SeqLocLen(slp);
125         query_start = SeqLocStart(slp);
126 
127         /* Find all HSP intervals in query */
128         for (hit_index = 0; hit_index < hit_list->hsplist_count; ++hit_index) {
129             Int4 hsp_index;
130             BlastHSPList* hsp_list = hit_list->hsplist_array[hit_index];
131             /* HSP lists cannot be NULL! */
132             ASSERT(hsp_list);
133             for (hsp_index = 0; hsp_index < hsp_list->hspcnt; ++hsp_index) {
134                 Int4 left, right;
135                 BlastHSP* hsp = hsp_list->hsp_array[hsp_index];
136                 /* HSP cannot be NULL! */
137                 ASSERT(hsp);
138                 if (hsp->query.frame == hsp->subject.frame) {
139                     left = hsp->query.offset;
140                     right = hsp->query.end - 1;
141                 } else {
142                     left = query_length - hsp->query.end;
143                     right = query_length - hsp->query.offset - 1;
144                 }
145                 /* Shift the coordinates so they correspond to the full 
146                    sequence. */
147                 left += query_start;
148                 right += query_start;
149                 BlastSeqLocNew(&loc_list, left, right);
150             }
151         }
152         /* Make the intervals unique */
153         BlastSeqLocCombine(&loc_list, REPEAT_MASK_LINK_VALUE);
154         ordered_loc_list = loc_list;
155         loc_list = NULL;
156 
157         mask->seqloc_array[query_index*kNumContexts] = ordered_loc_list;
158     }
159 
160     *mask_seqloc = BlastMaskLocToSeqLoc(kProgram, mask, query_seqloc);
161 
162     mask = BlastMaskLocFree(mask);
163 
164     return 0;
165 }
166 
167 Int2
168 Blast_FindRepeatFilterSeqLoc(SeqLoc* query_seqloc,
169                              const char* filter_string, 
170                              SeqLoc* *mask_loc,
171                              SBlastMessage **message)
172 {
173     char* repeat_database = NULL;
174     SBlastOptions* options = NULL;
175     Blast_SummaryReturn* sum_returns = NULL;
176     Int2 status = 0;
177     BlastSeqSrc* seq_src = NULL;
178     SeqLoc* filter_loc = NULL; /* Dummy variable, since search will be performed 
179                                   without filtering. */
180     BlastHSPResults* results = NULL;
181     SBlastFilterOptions* filtering_options = NULL;
182 
183     ASSERT(message);
184     if (filter_string == NULL)
185        return 0;
186     
187     status = BlastFilteringOptionsFromString(eBlastTypeBlastn, filter_string,
188         &filtering_options, NULL);
189 
190     if (status)
191       return status;
192 
193     /* If repeat filtering not requested, return success. */
194     if (filtering_options == NULL || filtering_options->repeatFilterOptions == NULL)
195     {
196         filtering_options = SBlastFilterOptionsFree(filtering_options);
197         return 0;
198     }
199     
200     repeat_database = filtering_options->repeatFilterOptions->database;
201 
202     sum_returns = Blast_SummaryReturnNew();
203     status = SBlastOptionsNew("blastn", &options, sum_returns);
204     
205     seq_src = ReaddbBlastSeqSrcInit(repeat_database, FALSE, 0, 0);
206     
207     if (!seq_src) {
208         SBlastMessageWrite(&sum_returns->error, SEV_ERROR, 
209            "Initialization of subject sequences source failed", NULL, FALSE);
210     } else {
211         char* error_str = BlastSeqSrcGetInitError(seq_src);
212         if (error_str)
213            SBlastMessageWrite(&sum_returns->error, SEV_ERROR, error_str, NULL, FALSE); 
214     }
215 
216     /* If there was an error initializing the sequence source, return without 
217        doing the search. */
218     if (sum_returns->error) {
219         *mask_loc = NULL;
220         *message = sum_returns->error;
221         sum_returns->error = NULL;
222         Blast_SummaryReturnFree(sum_returns);
223         filtering_options = SBlastFilterOptionsFree(filtering_options);
224         return -1;
225     }
226 
227     s_SetRepeatsSearchOptions(options);
228 
229     status =
230         Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL,
231                         seq_src, (SeqLoc*) NULL, options,
232                         (BlastTabularFormatData*) NULL,
233                         &results, &filter_loc, sum_returns);
234 
235     /* The ReadDBFILE structure will not be destroyed here, because the 
236        initialising function used readdb_attach */
237     BlastSeqSrcFree(seq_src);
238 
239     Blast_SummaryReturnFree(sum_returns);
240 
241     /* filter_loc must be NULL on return from Blast_RunSearch, but call the 
242        destruction function anyway - in case changes are made in the future. */
243     filter_loc = Blast_ValNodeMaskListFree(filter_loc);
244 
245     options = SBlastOptionsFree(options);
246     filtering_options = SBlastFilterOptionsFree(filtering_options);
247 
248     if (!status) {
249         /* Extract the repeat locations from the search results */
250         s_FillMaskLocFromBlastHSPResults(query_seqloc, results, mask_loc);
251     }
252 
253     results = Blast_HSPResultsFree(results);
254     return status;
255 }
256 
257 /* @} */
258 
259 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.