NCBI C++ ToolKit
blast_filter.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_filter.c 50124 2011-06-20 13:16:49Z maning $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /** @file blast_filter.c
28  * All code related to query sequence masking/filtering for BLAST
29  */
30 
31 #ifndef SKIP_DOXYGEN_PROCESSING
32 static char const rcsid[] =
33  "$Id: blast_filter.c 50124 2011-06-20 13:16:49Z maning $";
34 #endif /* SKIP_DOXYGEN_PROCESSING */
35 
39 
40 /****************************************************************************/
41 /* Constants */
/* Residue codes used as the mask character for nucleotide and protein data. */
42 const Uint1 kNuclMask = 14; /* N in BLASTNA */
43 const Uint1 kProtMask = 21; /* X in NCBISTDAA */
44 
45 
46 /** Allowed length of the filtering options string. */
/* Also the iteration bound of the scanning loops in s_LoadOptionsToBuffer,
 * s_ParseDustOptions and s_ParseSegOptions below. */
47 #define BLASTOPTIONS_BUFFER_SIZE 128
48 
49 
50 /** Copies filtering commands for one filtering algorithm from "instructions" to
51  * "buffer".
52  * ";" is a delimiter for the commands for different algorithms, so it stops
53  * copying when a ";" is found.
54  * Example filtering string: "m L; R -d rodents.lib"
55  * @param instructions filtering commands [in]
56  * @param buffer filled with filtering commands for one algorithm. [out]
57 */
58 static const char *
59 s_LoadOptionsToBuffer(const char *instructions, char* buffer)
60 {
61  Boolean not_started=TRUE;
62  char* buffer_ptr;
63  const char *ptr;
64  Int4 index;
65 
66  ptr = instructions;
67  buffer_ptr = buffer;
68  for (index=0; index<BLASTOPTIONS_BUFFER_SIZE && *ptr != NULLB; index++)
69  {
70  if (*ptr == ';')
71  { /* ";" is a delimiter for different filtering algorithms. */
72  ptr++;
73  break;
74  }
75  /* Remove blanks at the beginning. */
76  if (not_started && *ptr == ' ')
77  {
78  ptr++;
79  }
80  else
81  {
82  not_started = FALSE;
83  *buffer_ptr = *ptr;
84  buffer_ptr++; ptr++;
85  }
86  }
87 
88  *buffer_ptr = NULLB;
89 
90  if (not_started == FALSE)
91  { /* Remove trailing blanks. */
92  buffer_ptr--;
93  while (*buffer_ptr == ' ' && buffer_ptr > buffer)
94  {
95  *buffer_ptr = NULLB;
96  buffer_ptr--;
97  }
98  }
99 
100  return ptr;
101 }
102 
103 /** Parses repeat filtering options string.
104  * @param repeat_options Input character string [in]
105  * @param dbname Database name for repeats filtering [out]
106  */
107 static Int2
108 s_ParseRepeatOptions(const char* repeat_options, char** dbname)
109 {
110  char* ptr;
111 
112  ASSERT(dbname);
113  *dbname = NULL;
114 
115  if (!repeat_options)
116  return 0;
117 
118  ptr = strstr(repeat_options, "-d");
119  if (ptr) {
120  ptr += 2;
121  while (*ptr == ' ' || *ptr == '\t')
122  ++ptr;
123  *dbname = strdup(ptr);
124  }
125  return 0;
126 }
127 
128 /** Parses window masker options string.
129  * @param winmask_options Input character string [in]
130  * @param dbname Database name for window masker filtering [out]
131  * @param taxid Taxonomic ID for window masker filtering [out]
132  */
133 static Int2
134 s_ParseWindowMaskerOptions(const char * winmask_options,
135  char ** dbname,
136  int * taxid)
137 {
138  char* ptr = NULL;
139 
140  ASSERT(dbname);
141  *dbname = NULL;
142 
143  if (!winmask_options)
144  return 0;
145 
146  ptr = strstr(winmask_options, "-d");
147 
148  if (ptr) {
149  char * endp = 0;
150 
151  ptr += 2;
152  while (*ptr == ' ' || *ptr == '\t')
153  ++ptr;
154 
155  *dbname = strdup(ptr);
156 
157  for(endp = *dbname; *endp; ++endp) {
158  if (*endp == ' ' || *endp == '\t') {
159  *endp = (char)0;
160  break;
161  }
162  }
163  } else {
164  ptr = strstr(winmask_options, "-t");
165 
166  if (ptr) {
167  ptr += 2;
168  while (*ptr == ' ' || *ptr == '\t')
169  ++ptr;
170  *taxid = atoi(ptr);
171  }
172  }
173 
174  return 0;
175 }
176 
177 /** Parses options used for dust.
178  * @param ptr buffer containing instructions. [in]
179  * @param level sets level for dust. [out]
180  * @param window sets window for dust [out]
181  * @param linker sets linker for dust. [out]
182 */
183 static Int2
184 s_ParseDustOptions(const char *ptr, int* level, int* window, int* linker)
185 
186 {
188  int arg, index, index1, window_pri=-1, linker_pri=-1, level_pri=-1;
189 
190  arg = 0;
191  index1 = 0;
192  for (index=0; index<BLASTOPTIONS_BUFFER_SIZE; index++)
193  {
194  if (*ptr == ' ' || *ptr == NULLB)
195  {
196  long tmplong;
197  buffer[index1] = NULLB;
198  index1 = 0;
199  switch(arg) {
200  case 0:
201  sscanf(buffer, "%ld", &tmplong);
202  level_pri = tmplong;
203  break;
204  case 1:
205  sscanf(buffer, "%ld", &tmplong);
206  window_pri = tmplong;
207  break;
208  case 2:
209  sscanf(buffer, "%ld", &tmplong);
210  linker_pri = tmplong;
211  break;
212  default:
213  break;
214  }
215 
216  arg++;
217  while (*ptr == ' ')
218  ptr++;
219 
220  /* end of the buffer. */
221  if (*ptr == NULLB)
222  break;
223  }
224  else
225  {
226  buffer[index1] = *ptr; ptr++;
227  index1++;
228  }
229  }
230  if (arg != 0 && arg != 3)
231  return 1;
232 
233  *level = level_pri;
234  *window = window_pri;
235  *linker = linker_pri;
236 
237  return 0;
238 }
239 
240 /** parses a string to set three seg options.
241  * @param ptr buffer containing instructions [in]
242  * @param window returns "window" for seg algorithm. [out]
243  * @param locut returns "locut" for seg. [out]
244  * @param hicut returns "hicut" for seg. [out]
245 */
246 static Int2
247 s_ParseSegOptions(const char *ptr, Int4* window, double* locut, double* hicut)
248 
249 {
251  Int4 arg, index, index1;
252 
253  arg = 0;
254  index1 = 0;
255  for (index=0; index<BLASTOPTIONS_BUFFER_SIZE; index++)
256  {
257  if (*ptr == ' ' || *ptr == NULLB)
258  {
259  long tmplong;
260  double tmpdouble;
261  buffer[index1] = NULLB;
262  index1 = 0;
263  switch(arg) {
264  case 0:
265  sscanf(buffer, "%ld", &tmplong);
266  *window = tmplong;
267  break;
268  case 1:
269  sscanf(buffer, "%le", &tmpdouble);
270  *locut = tmpdouble;
271  break;
272  case 2:
273  sscanf(buffer, "%le", &tmpdouble);
274  *hicut = tmpdouble;
275  break;
276  default:
277  break;
278  }
279 
280  arg++;
281  while (*ptr == ' ')
282  ptr++;
283 
284  /* end of the buffer. */
285  if (*ptr == NULLB)
286  break;
287  }
288  else
289  {
290  buffer[index1] = *ptr; ptr++;
291  index1++;
292  }
293  }
294  if (arg != 0 && arg != 3)
295  return 1;
296 
297  return 0;
298 }
299 
/// Wrapper around strcat to ensure we don't do buffer overflows.
/// If the concatenated string (including its terminator) does not fit in
/// *dest_size bytes, the destination is grown to twice the required size.
/// @param dest heap-allocated, NUL-terminated string to concatenate to; may be
///        moved by realloc. On failure it is freed and set to NULL [in|out]
/// @param dest_size size of the dest array, modified if dest is grown [in|out]
/// @param string2append string to append to dest [in]
/// @return the concatenated string or NULL if we run out of memory
static char*
s_SafeStrCat(char** dest, unsigned int* dest_size, const char* string2append)
{
    const size_t kMaxSize = (size_t)((unsigned int)-1);
    size_t dest_length = strlen(*dest);
    size_t string2append_length = strlen(string2append);
    /* Bytes needed for both strings plus the terminating NUL. */
    size_t required = dest_length + string2append_length + 1;

    if (required > *dest_size) {
        char* grown = NULL;
        size_t target_size = required * 2;

        /* dest_size is an unsigned int; never record a size it cannot hold. */
        if (target_size > kMaxSize) {
            target_size = required;
        }
        if (target_size <= kMaxSize) {
            /* Keep the old pointer until realloc is known to have worked. */
            grown = (char*) realloc((void*) *dest, target_size);
        }
        if ( !grown ) {
            /* Callers simply return on failure, so release the string here. */
            free(*dest);
            *dest = NULL;
            return NULL;
        }
        *dest = grown;
        *dest_size = (unsigned int) target_size;
    }
    /* Both lengths are known: copy the suffix together with its terminator. */
    memcpy(*dest + dest_length, string2append, string2append_length + 1);
    return *dest;
}
323 
/* Serializes a set of filtering options into a newly allocated filter string.
 * NOTE(review): the line carrying the function name and parameter list (inner
 * line 325) is missing from this listing; the body reads a pointer named
 * filtering_options.
 *
 * The result is a sequence of ';'-terminated commands, appended in this order:
 *   dust           "L;" when level/window/linker equal kDustLevel/kDustWindow/
 *                  kDustLinker, otherwise "D <level> <window> <linker>;"
 *   seg            "L;" when window/locut/hicut equal kSegWindow/kSegLocut/
 *                  kSegHicut, otherwise "S <window> <locut> <hicut>;"
 *   repeats        "R -d <database>;" or "R;" when no database is set
 *   window masker  "W -t <taxid>;" when taxid is non-zero, else
 *                  "W -d <database>;" when a database is set
 *   mask at hash   "m;"
 * A NULL filtering_options, or one producing no commands, yields "F".
 * Returns 0 as soon as any s_SafeStrCat call fails.
 */
324 char*
326 {
327  char* retval = NULL;
328  unsigned int retval_size = 0;
329 
330  if (filtering_options == NULL) {
331  return strdup("F");
332  }
333 
334  retval_size = 64; /* Usually this will suffice */
/* NOTE(review): the calloc result is not checked; s_SafeStrCat calls
 * strlen(*dest) on it. */
335  retval = (char*) calloc(retval_size, sizeof(char));
336 
337  if (filtering_options->dustOptions) {
338  if (filtering_options->dustOptions->level == kDustLevel &&
339  filtering_options->dustOptions->window == kDustWindow &&
340  filtering_options->dustOptions->linker == kDustLinker) {
341  if (!s_SafeStrCat(&retval, &retval_size, "L;")) {
342  return 0;
343  }
344  } else {
/* NOTE(review): snprintf truncates silently if the three integers need more
 * than the 24-byte scratch buffer. */
345  char buffer[24] = { '\0' };
346  snprintf(buffer, sizeof(buffer), "D %d %d %d;",
347  filtering_options->dustOptions->level,
348  filtering_options->dustOptions->window,
349  filtering_options->dustOptions->linker);
350  if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
351  return 0;
352  }
353  }
354  }
355 
356  if (filtering_options->segOptions) {
357  if (filtering_options->segOptions->window == kSegWindow &&
358  filtering_options->segOptions->locut == kSegLocut &&
359  filtering_options->segOptions->hicut == kSegHicut) {
/* Default seg settings are written with the same "L;" token as default dust. */
360  if (!s_SafeStrCat(&retval, &retval_size, "L;")) {
361  return 0;
362  }
363  } else {
/* locut and hicut are written with one digit after the decimal point. */
364  char buffer[24] = { '\0' };
365  snprintf(buffer, sizeof(buffer), "S %d %1.1f %1.1f;",
366  filtering_options->segOptions->window,
367  filtering_options->segOptions->locut,
368  filtering_options->segOptions->hicut);
369  if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
370  return 0;
371  }
372  }
373  }
374 
375  if (filtering_options->repeatFilterOptions) {
376  if (filtering_options->repeatFilterOptions->database) {
377  if (!s_SafeStrCat(&retval, &retval_size, "R -d ")) {
378  return 0;
379  }
380  if (!s_SafeStrCat(&retval, &retval_size,
381  filtering_options->repeatFilterOptions->database)) {
382  return 0;
383  }
384  if (!s_SafeStrCat(&retval, &retval_size, ";")) {
385  return 0;
386  }
387  } else {
388  if (!s_SafeStrCat(&retval, &retval_size, "R;")) {
389  return 0;
390  }
391  }
392  }
393 
394  if (filtering_options->windowMaskerOptions) {
/* A non-zero taxid wins over a database name; with neither set nothing is
 * written for window masker. */
395  if (filtering_options->windowMaskerOptions->taxid != 0) {
396  char buffer[24] = { '\0' };
397  snprintf(buffer, sizeof(buffer), "W -t %d;",
398  filtering_options->windowMaskerOptions->taxid);
399  if (!s_SafeStrCat(&retval, &retval_size, buffer)) {
400  return 0;
401  }
402  } else if (filtering_options->windowMaskerOptions->database) {
403  if (!s_SafeStrCat(&retval, &retval_size, "W -d ")) {
404  return 0;
405  }
406  if (!s_SafeStrCat(&retval, &retval_size,
407  filtering_options->windowMaskerOptions->database)) {
408  return 0;
409  }
410  if (!s_SafeStrCat(&retval, &retval_size, ";")) {
411  return 0;
412  }
413  }
414  }
415 
416  /* Mask at hash is a modifier for other filtering options, as such it
417  * doesn't make sense to apply it by itself */
/* Both branches below currently append the same "m;" token. */
418  if (SBlastFilterOptionsMaskAtHash(filtering_options)) {
419  if (strlen(retval) != 0) {
420  /* Add mask at hash as a modifier for other filtering options */
421  if (!s_SafeStrCat(&retval, &retval_size, "m;")) {
422  return 0;
423  }
424  } else {
425  /* We still need to set "m" in a filter string (WB-391, WB-394) */
426  /* The string below can be modified into "mF" or "mL" or
427  whatever is decided to be the conventional meaning */
428  if (!s_SafeStrCat(&retval, &retval_size, "m;")) {
429  return 0;
430  }
431  }
432  }
433 
/* Nothing was written: report "no filtering". */
434  return strlen(retval) == 0
435  ? s_SafeStrCat(&retval, &retval_size, "F")
436  : retval;
437 }
438 
/* Parses a filter string into a newly created SBlastFilterOptions structure.
 * NOTE(review): several lines of this function are missing from this listing:
 * the line with the function name and first parameter (inner line 440; the
 * body reads a value named program_number), and the call lines between each
 * "if (blast_message)" and its "error_buffer);" continuation.
 *
 * Recognized command letters, scanned left to right:
 *   S  seg, optionally followed by "window locut hicut"
 *   D  dust, optionally followed by "level window linker"
 *   R  repeats, optionally followed by "-d <database>"
 *   W  window masker, optionally followed by "-d <database>" or "-t <taxid>"
 *   L or T  low-complexity filtering: dust for eBlastTypeBlastn, else seg
 *   m  mask at hash
 * Any other character is skipped. A NULL string or "F" (case-insensitive)
 * produces an empty options structure.
 * Returns 0 on success or the non-zero status of the failing step.
 */
439 Int2
441  const char* instructions,
442  SBlastFilterOptions* *filtering_options,
443  Blast_Message* *blast_message)
444 {
445  Boolean mask_at_hash = FALSE; /* the default. */
446  char* buffer;
447  const char* ptr = instructions;
448  char error_buffer[1024];
449  Int2 status = 0;
450  SSegOptions* segOptions = NULL;
451  SDustOptions* dustOptions = NULL;
452  SRepeatFilterOptions* repeatOptions = NULL;
453  SWindowMaskerOptions * winmaskOptions = NULL;
454 
455  *filtering_options = NULL;
456  if (blast_message)
457  *blast_message = NULL;
458 
459  if (instructions == NULL || strcasecmp(instructions, "F") == 0)
460  {
461  SBlastFilterOptionsNew(filtering_options, eEmpty);
462  return status;
463  }
464 
/* Scratch buffer for one algorithm's arguments. It is always filled from
 * ptr+1, so the copied text plus terminator fits in strlen(instructions)
 * bytes.
 * NOTE(review): the calloc result is not checked before use. */
465  buffer = (char*) calloc(strlen(instructions), sizeof(char));
466  /* allow old-style filters when m cannot be followed by the ';' */
467  if (ptr[0] == 'm' && ptr[1] == ' ')
468  {
469  mask_at_hash = TRUE;
470  ptr += 2;
471  }
472 
/* NOTE(review): the error returns inside this loop free only the options
 * object of the failing branch; objects created by earlier commands are not
 * released in the visible code. */
473  while (*ptr != NULLB)
474  {
475  if (*ptr == 'S')
476  {
477  SSegOptionsNew(&segOptions);
478  ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
/* An empty argument list keeps whatever SSegOptionsNew set up. */
479  if (buffer[0] != NULLB)
480  {
481  int window = 0;
482  double locut = .0, hicut = .0;
483  status = s_ParseSegOptions(buffer, &window, &locut, &hicut);
484  if (status)
485  {
486  segOptions = SSegOptionsFree(segOptions);
487  sprintf(error_buffer, "Error parsing filter string: %s", buffer);
488  if (blast_message)
490  error_buffer);
491  sfree(buffer);
492  return status;
493  }
494  segOptions->window = window;
495  segOptions->locut = locut;
496  segOptions->hicut = hicut;
497  }
498  }
499  else if (*ptr == 'D')
500  {
501  SDustOptionsNew(&dustOptions);
502  ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
503  if (buffer[0] != NULLB)
504  {
505  int window = 0, level = 0, linker = 0;
506  status = s_ParseDustOptions(buffer, &level, &window, &linker);
507  if (status)
508  {
509  dustOptions = SDustOptionsFree(dustOptions);
510  sprintf(error_buffer, "Error parsing filter string: %s", buffer);
511  if (blast_message)
513  error_buffer);
514  sfree(buffer);
515  return status;
516  }
517  dustOptions->level = level;
518  dustOptions->window = window;
519  dustOptions->linker = linker;
520  }
521  }
522  else if (*ptr == 'R')
523  {
524  SRepeatFilterOptionsNew(&repeatOptions);
525  ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
526  if (buffer[0] != NULLB)
527  {
528  char* dbname = NULL;
529  status = s_ParseRepeatOptions(buffer, &dbname);
530  if (status)
531  {
532  repeatOptions = SRepeatFilterOptionsFree(repeatOptions);
533  sprintf(error_buffer, "Error parsing filter string: %s", buffer);
534  if (blast_message)
536  error_buffer);
537  sfree(buffer);
538  return status;
539  }
/* The parsed name replaces the database currently stored in the options;
 * ownership of dbname moves to repeatOptions. */
540  if (dbname)
541  {
542  sfree(repeatOptions->database);
543  repeatOptions->database = dbname;
544  }
545  }
546  }
547  else if (*ptr == 'W')
548  {
549  SWindowMaskerOptionsNew(&winmaskOptions);
550 
551  ptr = s_LoadOptionsToBuffer(ptr+1, buffer);
552  if (buffer[0] != NULLB) {
553  char* dbname = NULL;
554  int taxid = 0;
555 
556  status = s_ParseWindowMaskerOptions(buffer, &dbname, &taxid);
557  if (status) {
558  winmaskOptions = SWindowMaskerOptionsFree(winmaskOptions);
559  sprintf(error_buffer, "Error parsing filter string: %s", buffer);
560  if (blast_message)
562  error_buffer);
563 
564  sfree(buffer);
565  return status;
566  }
567  if (dbname) {
568  sfree(winmaskOptions->database);
569  winmaskOptions->database = dbname;
570  }
571  if (taxid) {
572  winmaskOptions->taxid = taxid;
573  }
574  }
575  }
576  else if (*ptr == 'L' || *ptr == 'T')
577  { /* do low-complexity filtering; dust for blastn, otherwise seg.*/
578  if (program_number == eBlastTypeBlastn)
579  SDustOptionsNew(&dustOptions);
580  else
581  SSegOptionsNew(&segOptions);
582  ptr++;
583  }
584  else if (*ptr == 'm')
585  {
586  mask_at_hash = TRUE;
587  ptr++;
588  }
589  else
590  { /* Nothing applied */
591  ptr++;
592  }
593  }
594  sfree(buffer);
595 
/* NOTE(review): if this allocation fails, the option objects built above are
 * not released before returning. */
596  status = SBlastFilterOptionsNew(filtering_options, eEmpty);
597  if (status)
598  return status;
599 
/* Hand the collected per-algorithm options over to the result structure. */
600  (*filtering_options)->dustOptions = dustOptions;
601  (*filtering_options)->segOptions = segOptions;
602  (*filtering_options)->repeatFilterOptions = repeatOptions;
603  (*filtering_options)->windowMaskerOptions = winmaskOptions;
604  (*filtering_options)->mask_at_hash = mask_at_hash;
605 
606  return status;
607 }
608 
609 
/* Allocates a single BlastSeqLoc node whose range is [from, to] and passes it
 * to BlastSeqLocAppend together with head; returns that call's result, or
 * NULL if the node itself cannot be allocated.
 * NOTE(review): the declarator line (inner line 610) is missing from this
 * listing; judging by the three-argument call sites this is presumably
 * BlastSeqLocNew(head, from, to).
 */
611 {
612  BlastSeqLoc* loc = (BlastSeqLoc*) calloc(1, sizeof(BlastSeqLoc));
613  if ( !loc ) {
614  return NULL;
615  }
/* NOTE(review): the result of this second calloc is dereferenced without a
 * NULL check. */
616  loc->ssr = (SSeqRange*) calloc(1, sizeof(SSeqRange));
617  loc->ssr->left = from;
618  loc->ssr->right = to;
619 
620  return BlastSeqLocAppend(head, loc);
621 }
622 
/* Appends node to the end of the list referenced by head and returns node.
 * NOTE(review): the declarator line (inner line 623) is missing from this
 * listing; the (head, node) call sites suggest this is BlastSeqLocAppend.
 * - node == NULL: nothing is changed and NULL is returned.
 * - head == NULL: node is returned without being linked anywhere.
 * - *head == NULL: node becomes the head of the list.
 * Because the appended node is returned, callers can pass the address of a
 * cached tail pointer as head to avoid walking the whole list each time.
 */
624 {
625  if ( !node ) {
626  return NULL;
627  }
628 
629  if (head)
630  {
631  if (*head)
632  {
/* Walk to the last node and link the new one after it. */
633  BlastSeqLoc* tmp = *head;
634  while (tmp->next)
635  tmp = tmp->next;
636  tmp->next = node;
637  }
638  else
639  {
640  *head = node;
641  }
642  }
643 
644  return node;
645 }
646 
647 /** Makes a copy of the BlastSeqLoc and also a copy of the
648  * SSRange element. Does not copy BlastSeqLoc that is pointed
649  * to by "next".
650  * @param source the object to be copied [in]
651  * @return another BlastSeqLoc*
652  */
/* NOTE(review): the declarator line (inner line 653) is missing from this
 * listing. A NULL source yields NULL; otherwise a fresh, unlinked node is
 * created through BlastSeqLocNew with the source's left/right values. */
654 {
655  if ( !source ) {
656  return NULL;
657  }
658  ASSERT(source->ssr);
659  return BlastSeqLocNew(NULL, source->ssr->left, source->ssr->right);
660 }
661 
662 /** Calculates number of links in a chain of BlastSeqLoc's.
663  * @param var Chain of BlastSeqLoc structures [in]
664  * @return Number of links in the chain.
665  */
/* NOTE(review): the declarator line (inner line 666) is missing from this
 * listing. A NULL chain gives 0. */
667 {
668  BlastSeqLoc* itr = NULL;
669  Int4 retval = 0;
670 
/* The loop body is empty on purpose: counting happens in the for header. */
671  for (itr = (BlastSeqLoc*)var; itr; itr = itr->next, retval++) {
672  ;
673  }
674  return retval;
675 }
676 
677 /** Converts a BlastSeqLoc list to an array of pointers, each pointing to an
678  * element of the list passed in to this function and the last element points
679  * to NULL
680  * @param list List to convert to an array of pointers [in]
681  * @param count number of elements populated in the array [out]
682  */
/* NOTE(review): the line with the function name and parameters (inner line
 * 684) is missing from this listing.
 * Returns NULL and sets *count to 0 for a NULL list. Otherwise the returned
 * array has *count + 1 slots (the extra one stays NULL from calloc) and must
 * be freed by the caller; the list nodes themselves are not copied. */
683 static BlastSeqLoc**
685 {
686  BlastSeqLoc* tmp,** retval;
687  Int4 i;
688  *count = 0;
689 
690  if (list == NULL)
691  return NULL;
692 
693  *count = s_BlastSeqLocLen(list);
/* NOTE(review): the calloc result is written to without a NULL check. */
694  retval = (BlastSeqLoc**) calloc(((size_t)(*count)+1), sizeof(BlastSeqLoc*));
695  for (tmp = (BlastSeqLoc*)list, i = 0; tmp != NULL && i < *count; i++) {
696  retval[i] = tmp;
697  tmp = tmp->next;
698  }
699  return retval;
700 }
701 
702 /** Reverse elements in the list
703  * @param head pointer to pointer to the head of the list. [in|out]
704  * (this is not declared static so that it can be tested in the unit tests)
705  */
/* NOTE(review): the declarator lines (inner lines 706-707) are missing from
 * this listing. The nodes are relinked in place in reverse order (no node is
 * allocated or freed) and *head is updated to the former last node. A NULL
 * head or an empty list is left untouched. */
708 {
709  BlastSeqLoc** ptrs = NULL; /* array of pointers to BlastSeqLoc elements */
710  Int4 num_elems = 0, i = 0;
711 
712  if ( !head ) {
713  return;
714  }
715 
716  ptrs = s_BlastSeqLocListToArrayOfPointers(*head, &num_elems);
717  if (num_elems == 0) {
718  return;
719  }
720  ASSERT(ptrs);
721  *head = ptrs[num_elems-1];
/* Point every node at its former predecessor. */
722  for (i = num_elems-1; i > 0; i--) {
723  ptrs[i]->next = ptrs[i-1];
724  }
/* The former head is now the tail. */
725  ptrs[0]->next = NULL;
726  sfree(ptrs);
727 }
728 
/* Releases one node (its SSeqRange and the node itself) without following
 * its next pointer. Always returns NULL; a NULL argument is accepted.
 * NOTE(review): the declarator line (inner line 729) is missing from this
 * listing; presumably BlastSeqLocNodeFree(loc), as called elsewhere here. */
730 {
731  if ( !loc ) {
732  return NULL;
733  }
734  sfree(loc->ssr);
735  sfree(loc);
736  return NULL;
737 }
738 
/* Releases every node of the list starting at loc and returns NULL.
 * NOTE(review): the declarator line (inner line 739) is missing from this
 * listing; presumably BlastSeqLocFree(loc). */
740 {
741  while (loc) {
/* Remember the successor before the current node is freed. */
742  BlastSeqLoc* next_loc = loc->next;
743  loc = BlastSeqLocNodeFree(loc);
744  loc = next_loc;
745  }
746  return NULL;
747 }
748 
/* Returns a node-by-node copy of the list starting at head (NULL for an
 * empty list); the copy shares no nodes with the original.
 * NOTE(review): the declarator line (inner line 749) is missing from this
 * listing; presumably BlastSeqLocListDup(head). */
750 {
751  BlastSeqLoc* retval = NULL;
752  BlastSeqLoc* retval_tail = NULL;
753 
754  for (; head; head = head->next) {
/* The first append goes through &retval to set the head of the copy; later
 * appends go through the cached tail so the list is not re-walked. */
755  retval_tail = BlastSeqLocAppend(retval_tail ? &retval_tail : &retval,
756  s_BlastSeqLocNodeDup(head));
757  }
758 
759  return retval;
760 }
761 
/* Allocates a zeroed BlastMaskLoc with room for `total` list heads; when
 * total is not positive, seqloc_array stays NULL.
 * NOTE(review): the declarator line (inner line 762) is missing from this
 * listing; presumably BlastMaskLocNew(total).
 * NOTE(review): retval is dereferenced without checking calloc's result, and
 * a failed array allocation is not reported to the caller. */
763 {
764  BlastMaskLoc* retval = (BlastMaskLoc *) calloc(1, sizeof(BlastMaskLoc));
765  retval->total_size = total;
766  if (total > 0)
767  retval->seqloc_array = (BlastSeqLoc **) calloc(total,
768  sizeof(BlastSeqLoc *));
769  return retval;
770 }
771 
/* Deep-copies a BlastMaskLoc: a new structure of the same total_size whose
 * per-index lists are duplicated with BlastSeqLocListDup. NULL in, NULL out.
 * NOTE(review): the declarator line (inner line 772) is missing from this
 * listing; the body reads a pointer named mask_loc.
 * NOTE(review): retval and retval->seqloc_array are used without checking
 * that BlastMaskLocNew succeeded. */
773 {
774  BlastMaskLoc* retval = NULL;
775  Int4 index = 0;
776 
777  if ( !mask_loc ) {
778  return NULL;
779  }
780 
781  retval = BlastMaskLocNew(mask_loc->total_size);
782 
783  for (index = 0; index < mask_loc->total_size; index++) {
784  retval->seqloc_array[index] =
785  BlastSeqLocListDup(mask_loc->seqloc_array[index]);
786  }
787 
788  return retval;
789 }
790 
/* Releases a BlastMaskLoc: every list in seqloc_array, the array, and the
 * structure itself. Always returns NULL; a NULL argument is accepted.
 * NOTE(review): the declarator line (inner line 791) is missing from this
 * listing; the body reads a pointer named mask_loc. */
792 {
793  Int4 index;
794 
795  if (mask_loc == NULL)
796  return NULL;
797 
798  for (index=0; index<mask_loc->total_size; index++)
799  {
/* The array pointer is re-tested on every iteration; it does not change
 * inside the loop. */
800  if (mask_loc->seqloc_array != NULL)
801  BlastSeqLocFree(mask_loc->seqloc_array[index]);
802  }
803  sfree(mask_loc->seqloc_array);
804  sfree(mask_loc);
805  return NULL;
806 }
807 
/* Replaces, in place, the nucleotide-coordinate masks of each query with masks
 * in translated (protein) coordinates for all NUM_FRAMES contexts of that
 * query. Returns 0, also when mask_loc is NULL.
 * NOTE(review): the first declarator line (inner line 808, with the function
 * name and the mask_loc parameter) and inner line 826 (part of the
 * BlastQueryInfoGetQueryLength call) are missing from this listing.
 */
809  const BlastQueryInfo* query_info)
810 {
811  Uint4 seq_index;
812  BlastSeqLoc* dna_seqlocs[NUM_FRAMES];
813 
814  if (!mask_loc)
815  return 0;
816 
817  /* Check that the array size in BlastMaskLoc corresponds to the number
818  of contexts in BlastQueryInfo. */
819  ASSERT(mask_loc->total_size == query_info->last_context + 1);
820 
821  /* Loop over multiple DNA sequences */
822  for (seq_index = 0; seq_index < (Uint4)query_info->num_queries;
823  ++seq_index) {
/* Index of this query's first context in seqloc_array and contexts[]. */
824  const Uint4 ctx_idx = NUM_FRAMES * seq_index;
825  const Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
827  seq_index);
828  Int4 context;
829 
830  /* Save the DNA masking locations, as they'll be freed and overwritten
831  * by their translations */
832  memset((void*) &dna_seqlocs, 0, sizeof(dna_seqlocs));
833  memcpy((void*) &dna_seqlocs,
834  (void*) &mask_loc->seqloc_array[ctx_idx],
835  sizeof(dna_seqlocs));
/* Clear the slots so the translated lists can be built from scratch. */
836  memset((void*) &mask_loc->seqloc_array[ctx_idx], 0, sizeof(dna_seqlocs));
837 
838  /* Reproduce this mask for all 6 frames, with translated coordinates */
839  for (context = 0; context < NUM_FRAMES; ++context) {
840  const Int2 frame = BLAST_ContextToFrame(eBlastTypeBlastx, context);
841  BlastSeqLoc* frame_seqloc = dna_seqlocs[context];
842  BlastSeqLoc* prot_tail = NULL;
843  BlastSeqLoc* itr = NULL;
844 
845  /* If no masks were provided for some frames, use the first one */
846  if (frame_seqloc == NULL && dna_seqlocs[0]) {
847  frame_seqloc = dna_seqlocs[0];
848  }
849  for (itr = frame_seqloc; itr; itr = itr->next) {
850  Int4 from, to;
851  SSeqRange* seq_range = itr->ssr;
852  /* masks should be 0-offset */
853  ASSERT(seq_range->right < dna_length);
854  ASSERT(seq_range->left >= 0);
/* For negative frames the coordinates are measured from the end of the
 * sequence, so right maps to from and left maps to to. */
855  if (frame < 0) {
856  from = (dna_length + frame - seq_range->right)/CODON_LENGTH;
857  to = (dna_length + frame - seq_range->left)/CODON_LENGTH;
858  } else {
859  from = (seq_range->left - frame + 1)/CODON_LENGTH;
860  to = (seq_range->right - frame + 1)/CODON_LENGTH;
861  }
862 
/* Clamp both ends into [0, query_length - 1] of the translated context. */
863  if (from < 0)
864  from = 0;
865  if (to < 0)
866  to = 0;
867  if (from >= query_info->contexts[ctx_idx+context].query_length)
868  from = query_info->contexts[ctx_idx+context].query_length - 1;
869  if (to >= query_info->contexts[ctx_idx+context].query_length)
870  to = query_info->contexts[ctx_idx+context].query_length - 1;
871 
872  ASSERT(from >= 0);
873  ASSERT(to >= 0);
874  ASSERT(from < query_info->contexts[ctx_idx+context].query_length);
875  ASSERT(to < query_info->contexts[ctx_idx+context].query_length);
876 
877  /* Cache the tail of the list to avoid the overhead of
878  * traversing the list when appending to it */
879  prot_tail = BlastSeqLocNew((prot_tail
880  ? & prot_tail
881  : & mask_loc->seqloc_array[ctx_idx+context]),
882  from, to);
883  }
884  }
/* The saved nucleotide lists are no longer referenced by mask_loc. */
885  for (context = 0; context < NUM_FRAMES; ++context) {
886  BlastSeqLocFree(dna_seqlocs[context]);
887  }
888  }
889 
890  return 0;
891 }
892 
893 
/* Converts, in place, every mask range of every frame from translated
 * (protein) coordinates back to nucleotide coordinates of the owning query.
 * Returns 0, also when mask_loc is NULL.
 * NOTE(review): the first declarator line (inner line 894, with the function
 * name and the mask_loc parameter) and inner line 915 (part of the
 * BlastQueryInfoGetQueryLength call) are missing from this listing.
 */
895  const BlastQueryInfo* query_info)
896 {
897  Int2 status = 0;
898  Int4 index;
899 
900  /* If there is not mask, there is nothing to convert to DNA coordinates,
901  hence just return. */
902  if (!mask_loc)
903  return 0;
904 
905  /* Check that the array size in BlastMaskLoc corresponds to the number
906  of contexts in BlastQueryInfo. */
907  ASSERT(mask_loc->total_size == query_info->last_context + 1);
908 
909  /* Loop over all DNA sequences */
910  for (index=0; index < query_info->num_queries; ++index)
911  {
912  Int4 frame_start = index*NUM_FRAMES;
913  Int4 frame_index;
914  Int4 dna_length = BlastQueryInfoGetQueryLength(query_info,
916  index);
917  /* Loop over all frames of one DNA sequence */
918  for (frame_index=frame_start; frame_index<(frame_start+NUM_FRAMES);
919  frame_index++) {
920  BlastSeqLoc* loc;
921  Int2 frame =
922  BLAST_ContextToFrame(eBlastTypeBlastx, frame_index % NUM_FRAMES);
923  /* Loop over all mask locations for a given frame */
924  for (loc = mask_loc->seqloc_array[frame_index]; loc; loc = loc->next) {
925  Int4 from=0, to=0;
926  SSeqRange* seq_range = loc->ssr;
/* For negative frames the protein range is counted from the end of the DNA,
 * so the protein left end determines the DNA right end and vice versa. */
927  if (frame < 0) {
928  to = dna_length - CODON_LENGTH*seq_range->left + frame;
929  from = dna_length - CODON_LENGTH*seq_range->right + frame + 1;
930  } else {
931  from = CODON_LENGTH*seq_range->left + frame - 1;
932  to = CODON_LENGTH*seq_range->right + frame - 1;
933  }
934 
/* Clamp both ends into [0, dna_length - 1]. */
935  if (from < 0)
936  from = 0;
937  if (to < 0)
938  to = 0;
939  if (from >= dna_length)
940  from = dna_length - 1;
941  if (to >= dna_length)
942  to = dna_length - 1;
943 
944  ASSERT(from >= 0);
945  ASSERT(to >= 0);
946  ASSERT(from < dna_length);
947  ASSERT(to < dna_length);
948 
/* Overwrite the range in the existing node; no nodes are added or removed. */
949  seq_range->left = from;
950  seq_range->right = to;
951  }
952  }
953  }
954  return status;
955 }
956 
957 /** Used for qsort, compares two SeqLoc's by starting position. */
958 static int s_SeqRangeSortByStartPosition(const void *vp1, const void *vp2)
959 {
960  BlastSeqLoc* v1 = *((BlastSeqLoc**) vp1);
961  BlastSeqLoc* v2 = *((BlastSeqLoc**) vp2);
962  SSeqRange* loc1 = (SSeqRange*) v1->ssr;
963  SSeqRange* loc2 = (SSeqRange*) v2->ssr;
964 
965  if (loc1->left < loc2->left)
966  return -1;
967  else if (loc1->left > loc2->left)
968  return 1;
969  else
970  return 0;
971 }
972 
973 void
974 BlastSeqLocCombine(BlastSeqLoc** mask_loc, Int4 link_value)
975 {
976  BlastSeqLoc** ptrs = NULL;
977  Int4 i = 0, num_elems = 0;
978 
979  /* Break up the list into an array of pointers and sort it */
980  ptrs = s_BlastSeqLocListToArrayOfPointers(*mask_loc, &num_elems);
981  if (num_elems == 0) {
982  return;
983  }
984  ASSERT(ptrs);
985  qsort(ptrs, (size_t)num_elems, sizeof(*ptrs),
987 
988  /* Merge the overlapping elements */
989  {
990  BlastSeqLoc* curr_tail = *mask_loc = ptrs[0];
991  for (i = 0; i < num_elems - 1; i++) {
992  const SSeqRange* next_ssr = ptrs[i+1]->ssr;
993  const Int4 stop = curr_tail->ssr->right;
994 
995  if ((stop + link_value) > next_ssr->left) {
996  curr_tail->ssr->right = MAX(stop, next_ssr->right);
997  ptrs[i+1] = BlastSeqLocNodeFree(ptrs[i+1]);
998  } else {
999  curr_tail = ptrs[i+1];
1000  }
1001  }
1002  }
1003 
1004  /* Rebuild the linked list */
1005  {
1006  BlastSeqLoc* tail = *mask_loc;
1007  for (i = 1; i < num_elems; i++) {
1008  if (ptrs[i]) {
1009  tail->next = ptrs[i];
1010  tail = ptrs[i];
1011  }
1012  }
1013  tail->next = NULL;
1014  }
1015  sfree(ptrs);
1016 }
1017 
/* Builds, in *complement_mask, one list of the unmasked intervals of every
 * valid context, expressed in offsets into the concatenated query
 * (query_offset based). Returns -1 if complement_mask is NULL, otherwise 0.
 * NOTE(review): the line with the function name and first parameter (inner
 * line 1019) is missing from this listing; the body reads a value named
 * program_number.
 * For reverse-strand contexts the mask list stored in mask_loc is reversed in
 * place through BlastSeqLocListReverse, although mask_loc is a pointer to
 * const.
 */
1018 Int2
1020  const BlastQueryInfo* query_info,
1021  const BlastMaskLoc* mask_loc, BlastSeqLoc* *complement_mask)
1022 {
1023  Int4 context;
1024  const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
1025  BlastSeqLoc* tail = NULL; /* Pointer to the tail of the complement_mask
1026  linked list */
1027 
1028  if (complement_mask == NULL)
1029  return -1;
1030 
1031  *complement_mask = NULL;
1032 
1033  for (context = query_info->first_context;
1034  context <= query_info->last_context; ++context) {
1035 
1036  Boolean first = TRUE; /* Specifies beginning of query. */
1037  Boolean last_interval_open=TRUE; /* if TRUE last interval needs to be closed. */
1038  Int4 start_offset, end_offset, filter_start, filter_end;
1039  Int4 left=0, right; /* Used for left/right extent of a region. */
1040  BlastSeqLoc* loc = NULL;
1041 
1042  if (query_info->contexts[context].is_valid == FALSE) {
1043  continue;
1044  }
1045 
/* First and last offset of this context within the concatenated query. */
1046  start_offset = query_info->contexts[context].query_offset;
1047  end_offset = query_info->contexts[context].query_length
1048  + start_offset - 1;
1049  ASSERT(start_offset <= end_offset);
1050 
1051  /* mask_loc NULL is simply the case that NULL was passed in, which we
1052  take to mean that nothing on query is masked. */
1053  if (mask_loc == NULL || mask_loc->seqloc_array[context] == NULL) {
1054  /* Cache the tail of the list to avoid the overhead of traversing the
1055  * list when appending to it */
1056  tail = BlastSeqLocNew(tail ? &tail : complement_mask,
1057  start_offset, end_offset);
1058  continue;
1059  }
1060 
1061  if (BlastIsReverseStrand(kIsNucl, context)) {
1062  BlastSeqLocListReverse(&mask_loc->seqloc_array[context]);
1063  }
1064  loc = mask_loc->seqloc_array[context];
1065 
1066  first = TRUE;
1067  for ( ; loc; loc = loc->next) {
1068  SSeqRange* seq_range = loc->ssr;
/* Translate the mask into concatenated-query offsets; on the reverse strand
 * the range is mirrored around end_offset. */
1069  if (BlastIsReverseStrand(kIsNucl, context)) {
1070  filter_start = end_offset - seq_range->right;
1071  filter_end = end_offset - seq_range->left;
1072  } else {
1073  filter_start = start_offset + seq_range->left;
1074  filter_end = start_offset + seq_range->right;
1075  }
1076  /* The canonical "state" at the top of this
1077  while loop is that both "left" and "right" have
1078  been initialized to their correct values.
1079  The first time this loop is entered in a call to
1080  the function this is not true and the following "if"
1081  statement moves everything to the canonical state. */
1082  if (first) {
1083  last_interval_open = TRUE;
1084  first = FALSE;
1085 
1086  if (filter_start > start_offset) {
1087  /* beginning of sequence not filtered */
1088  left = start_offset;
1089  } else {
1090  /* beginning of sequence filtered */
/* NOTE(review): this path skips the end-of-sequence test below; if the first
 * mask also reaches end_offset, the closing interval after the loop appears
 * to be created with left > right - confirm. */
1091  left = filter_end + 1;
1092  continue;
1093  }
1094  }
1095 
1096  right = filter_start - 1;
1097 
1098  /* Cache the tail of the list to avoid the overhead of traversing the
1099  * list when appending to it */
1100  tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
1101  if (filter_end >= end_offset) {
1102  /* last masked region at end of sequence */
1103  last_interval_open = FALSE;
1104  break;
1105  } else {
1106  left = filter_end + 1;
1107  }
1108  }
1109 
1110  if (last_interval_open) {
1111  /* Need to finish SSeqRange* for last interval. */
1112  right = end_offset;
1113  /* Cache the tail of the list to avoid the overhead of traversing the
1114  * list when appending to it */
1115  tail = BlastSeqLocNew((tail ? &tail : complement_mask), left, right);
1116  }
1117  }
1118  return 0;
1119 }
1120 
1121 
/* Validates the filtering options and, when seg options are present, runs
 * SeqBufferSeg over the sequence, returning the resulting locations in
 * *seqloc_retval (set to NULL first). No other filtering algorithm is applied
 * in the visible body. Returns the status of validation or of SeqBufferSeg.
 * NOTE(review): the line with the function name and first parameter (inner
 * line 1123) is missing from this listing; the body reads a value named
 * program_number and is called below as BlastSetUp_Filter.
 */
1122 Int2
1124  Uint1* sequence,
1125  Int4 length,
1126  Int4 offset,
1127  const SBlastFilterOptions* filter_options,
1128  BlastSeqLoc** seqloc_retval,
1129  Blast_Message* *blast_message)
1130 {
1131  Int2 status=0; /* return value. */
1132 
1133  ASSERT(filter_options);
1134  ASSERT(seqloc_retval);
1135 
1136  *seqloc_retval = NULL;
1137 
1138  status = SBlastFilterOptionsValidate(program_number, filter_options,
1139  blast_message);
1140  if (status)
1141  return status;
1142 
1143  if (filter_options->segOptions)
1144  {
1145  SSegOptions* seg_options = filter_options->segOptions;
1146  SegParameters* sparamsp=NULL;
1147 
/* NOTE(review): sparamsp is dereferenced without a NULL check. */
1148  sparamsp = SegParametersNewAa();
1149  sparamsp->overlaps = TRUE;
/* Only positive option values override what SegParametersNewAa set up. */
1150  if (seg_options->window > 0)
1151  sparamsp->window = seg_options->window;
1152  if (seg_options->locut > 0.0)
1153  sparamsp->locut = seg_options->locut;
1154  if (seg_options->hicut > 0.0)
1155  sparamsp->hicut = seg_options->hicut;
1156 
1157  status = SeqBufferSeg(sequence, length, offset, sparamsp,
1158  seqloc_retval);
1159  SegParametersFree(sparamsp);
1160  sparamsp = NULL;
1161  }
1162 
1163  return status;
1164 }
1165 
1166 void
1167 BlastSeqLocReverse(BlastSeqLoc* masks, Int4 query_length)
1168 {
1169  for(; masks; masks = masks->next) {
1170  masks->ssr->left = query_length - 1 - masks->ssr->right;
1171  masks->ssr->right = query_length - 1 - masks->ssr->left;
1172  }
1173 }
1174 
1175 /** Calculates the mask locations one context at a time.
1176  * @param query_blk sequence [in]
1177  * @param query_info information about sequences [in]
1178  * @param context which context is this? [in]
1179  * @param program_number program (blastn, blastp, etc.) [in]
1180  * @param filter_options instructions for producing mask [in]
1181  * @param filter_out results of filtering operations [out]
1182  * @param blast_message any error or warning messages [out]
1183  * @return zero on success
1184  */
1185 static Int2
1187  const BlastQueryInfo* query_info,
1188  Int4 context,
1189  EBlastProgramType program_number,
1190  const SBlastFilterOptions* filter_options,
1191  BlastSeqLoc* *filter_out,
1192  Blast_Message* *blast_message)
1193 {
1194  Int2 status = 0;
1195  Int4 query_length = 0; /* Length of query described by SeqLocPtr. */
1196  Int4 context_offset;
1197  Uint1 *buffer; /* holds sequence for plus strand or protein. */
1198 
1199  const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
1200 
1201  context_offset = query_info->contexts[context].query_offset;
1202  buffer = &query_blk->sequence[context_offset];
1203 
1204  if (query_info->contexts[context].is_valid == FALSE) {
1205  return 0;
1206  }
1207 
1208  query_length = query_info->contexts[context].query_length;
1209 
1210  status = BlastSetUp_Filter(program_number,
1211  buffer,
1212  query_length,
1213  0,
1214  filter_options,
1215  filter_out,
1216  blast_message);
1217  if (status)
1218  return status;
1219 
1220  if (BlastIsReverseStrand(kIsNucl, context) == TRUE) {
1221  /* Reverse this as it's on minus strand. */
1222  BlastSeqLocReverse(*filter_out, query_length);
1223  }
1224 
1225  /* Extract the mask locations corresponding to this query
1226  (frame, strand), detach it from other masks.
1227  NB: for translated search the mask locations are expected in
1228  protein coordinates. The nucleotide locations must be converted
1229  to protein coordinates prior to the call to BLAST_MainSetUp.
1230  */
1231  {
1232  /* Auxiliary locations for lower-case masking or any other masking
1233  * which occurred outside of CORE BLAST */
1234  BlastSeqLoc *lcase_mask_slp = NULL;
1235  if (query_blk->lcase_mask && query_blk->lcase_mask->seqloc_array)
1236  {
1237  ASSERT(context < query_blk->lcase_mask->total_size);
1238  lcase_mask_slp = query_blk->lcase_mask->seqloc_array[context];
1239  /* Set location list to NULL, to allow safe memory deallocation,
1240  ownership transferred to filter_out below. */
1241  query_blk->lcase_mask->seqloc_array[context] = NULL;
1242  }
1243 
1244  /* Attach the lower case mask locations to the filter locations and
1245  combine them */
1246  BlastSeqLocAppend(filter_out, lcase_mask_slp);
1247  }
1248 
1249  BlastSeqLocCombine(filter_out, 0);
1250 
1251  return 0;
1252 }
1253 
1254 Int2
1256  const BlastQueryInfo* query_info,
1257  EBlastProgramType program_number,
1258  const SBlastFilterOptions* filter_options,
1259  BlastMaskLoc** filter_maskloc,
1260  Blast_Message** blast_message)
1261 {
1262  Int2 status = 0;
1263  Int4 context = 0; /* loop variable. */
1264  const int kNumContexts = query_info->last_context + 1;
1265 
1266  ASSERT(query_info && query_blk && filter_maskloc);
1267 
1268  ASSERT(blast_message);
1269  ASSERT(kNumContexts ==
1270  query_info->num_queries*BLAST_GetNumberOfContexts(program_number));
1271  *filter_maskloc = BlastMaskLocNew(kNumContexts);
1272 
1273  for (context = query_info->first_context;
1274  context <= query_info->last_context; ++context) {
1275 
1276  BlastSeqLoc *filter_per_context = NULL;
1277  status = s_GetFilteringLocationsForOneContext(query_blk,
1278  query_info,
1279  context,
1280  program_number,
1281  filter_options,
1282  &filter_per_context,
1283  blast_message);
1284  if (status) {
1285  Blast_MessageWrite(blast_message, eBlastSevError, context,
1286  "Failure at filtering");
1287  return status;
1288  }
1289 
1290  /* NB: for translated searches filter locations are returned in
1291  protein coordinates, because the DNA lengths of sequences are
1292  not available here. The caller must take care of converting
1293  them back to nucleotide coordinates. */
1294  (*filter_maskloc)->seqloc_array[context] = filter_per_context;
1295  }
1296  return 0;
1297 }
1298 
1299 void
1301  const BlastSeqLoc* mask_loc, Boolean reverse, Int4 offset)
1302 {
1303  const Uint1 kMaskingLetter = is_na ? kNuclMask : kProtMask;
1304  ASSERT(buffer);
1305  for (; mask_loc; mask_loc = mask_loc->next) {
1306 
1307  Int4 index, start, stop;
1308 
1309  if (reverse) {
1310  start = length - 1 - mask_loc->ssr->right;
1311  stop = length - 1 - mask_loc->ssr->left;
1312  } else {
1313  start = mask_loc->ssr->left;
1314  stop = mask_loc->ssr->right;
1315  }
1316 
1317  start -= offset;
1318  stop -= offset;
1319 
1320  ASSERT(start < length);
1321  ASSERT(stop <= length);
1322 
1323  for (index = start; index <= stop; index++)
1324  buffer[index] = kMaskingLetter;
1325  }
1326 }
1327 
1328 void
1330 {
1331  Uint1 *sequence = seq->sequence;
1332  Int4 length = seq->length;
1333  Int4 i;
1334 
1335  for (i = 0; i < length; i++) {
1336  if (sequence[i] >= min_invalid) {
1337  sequence[i] = kProtMask;
1338  }
1339  }
1340 }
1341 
1342 void
1344  const BlastQueryInfo* query_info,
1345  const BlastMaskLoc *filter_maskloc,
1346  EBlastProgramType program_number)
1347 {
1348  const Boolean kIsNucl = (program_number == eBlastTypeBlastn);
1349  Int4 context; /* loop variable. */
1350  Int4 total_length;
1351  Boolean has_mask = FALSE; /* Check for whether filter_maskloc is empty. */
1352  Int4 index; /* loop variable. */
1353 
1354  ASSERT(query_blk);
1355  ASSERT(query_info);
1356  ASSERT(filter_maskloc);
1357 
1358 
1359  for (index=0; index<filter_maskloc->total_size; index++)
1360  {
1361  if (filter_maskloc->seqloc_array[index])
1362  {
1363  has_mask = TRUE;
1364  break;
1365  }
1366  }
1367  if (has_mask == FALSE)
1368  return;
1369 
1370 
1371  total_length = query_info->contexts[query_info->last_context].query_offset
1372  + query_info->contexts[query_info->last_context].query_length + 2;
1373  query_blk->sequence_start_nomask = BlastMemDup(query_blk->sequence_start, total_length);
1374  query_blk->sequence_nomask = query_blk->sequence_start_nomask + 1;
1375  query_blk->nomask_allocated = TRUE;
1376 
1377  for (context = query_info->first_context;
1378  context <= query_info->last_context; ++context) {
1379 
1380  Int4 query_length = 0;
1381  Int4 context_offset = 0;
1382  Uint1 *buffer = NULL; /* holds sequence */
1383 
1384  if (query_info->contexts[context].is_valid == FALSE) {
1385  continue;
1386  }
1387 
1388  query_length = query_info->contexts[context].query_length;
1389 
1390  context_offset = query_info->contexts[context].query_offset;
1391  buffer = &query_blk->sequence[context_offset];
1392  ASSERT(buffer);
1393 
1394  Blast_MaskTheResidues(buffer, query_length, kIsNucl,
1395  filter_maskloc->seqloc_array[context],
1396  BlastIsReverseStrand(kIsNucl, context), 0);
1397  }
1398 }
static Int2 s_ParseSegOptions(const char *ptr, Int4 *window, double *locut, double *hicut)
parses a string to set three seg options.
Definition: blast_filter.c:247
void Blast_MaskTheResidues(Uint1 *buffer, Int4 length, Boolean is_na, const BlastSeqLoc *mask_loc, Boolean reverse, Int4 offset)
Masks the letters in buffer.
Boolean SBlastFilterOptionsMaskAtHash(const SBlastFilterOptions *filter_options)
Queries whether masking should be done only for the lookup table or for the entire search...
static BlastSeqLoc * s_BlastSeqLocNodeDup(BlastSeqLoc *source)
Makes a copy of the BlastSeqLoc and also a copy of the SSRange element.
Definition: blast_filter.c:653
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
This function translates the context number of a context into the frame of the sequence.
Definition: blast_util.c:835
const int kSegWindow
Window that SEG examines at once.
Definition: blast_seg.c:48
static const char * s_LoadOptionsToBuffer(const char *instructions, char *buffer)
Copies filtering commands for one filtering algorithm from "instructions" to "buffer".
Definition: blast_filter.c:59
void BlastSeqLocListReverse(BlastSeqLoc **head)
Reverse elements in the list.
Definition: blast_filter.c:707
Int2 BlastMaskLocDNAToProtein(BlastMaskLoc *mask_loc, const BlastQueryInfo *query_info)
Given a BlastMaskLoc with an array of lists of DNA mask locations, substitutes that array by a new ar...
Definition: blast_filter.c:808
BlastMaskLoc * lcase_mask
Locations to be masked from operations on this sequence: lookup table for query; scanning for subject...
Definition: blast_def.h:265
int taxid
Select masking database for this TaxID.
void * BlastMemDup(const void *orig, size_t size)
Copies memory using memcpy and malloc.
Definition: ncbi_std.c:40
SWindowMaskerOptions * windowMaskerOptions
organism specific filtering with window masker.
Int4 window
initial window size to trigger further work.
Definition: blast_seg.h:50
#define NULLB
terminating byte of a char* string.
Definition: ncbi_std.h:179
signed int Int4
Alias for signed int.
Definition: ncbitype.h:120
static Int2 s_ParseDustOptions(const char *ptr, int *level, int *window, int *linker)
Parses options used for dust.
Definition: blast_filter.c:184
unsigned int Uint4
Alias for unsigned int.
Definition: ncbitype.h:121
All filtering options.
unsigned int BLAST_GetNumberOfContexts(EBlastProgramType program)
Get the number of contexts for a given program.
Definition: blast_util.c:1358
Boolean overlaps
Definition: blast_seg.h:55
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:95
int num_queries
Number of query sequences.
BlastMaskLoc * BlastMaskLocDup(const BlastMaskLoc *mask_loc)
Perform a deep copy of the BlastMaskLoc structure passed to this function.
Definition: blast_filter.c:772
BLAST filtering functions.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
Int2 SDustOptionsNew(SDustOptions **dust_options)
Allocates memory for SDustOptions, fills in defaults.
Definition: blast_options.c:61
Structure for keeping the query masking information.
Definition: blast_def.h:210
void Blast_MaskUnsupportedAA(BLAST_SequenceBlk *seq, Uint1 min_invalid)
Mask protein letters that are currently unsupported.
const Uint1 kNuclMask
BLASTNA element used to mask bases in BLAST.
Definition: blast_filter.c:42
BlastSeqLoc * BlastSeqLocListDup(BlastSeqLoc *head)
Make a deep copy of the linked list of BlastSeqLoc-s pointed to by its argument.
Definition: blast_filter.c:749
const double kSegLocut
Locut parameter for SEG.
Definition: blast_seg.c:49
#define ASSERT
macro for assert.
Definition: ncbi_std.h:105
int window
initial window to trigger further work.
Int2 SBlastFilterOptionsValidate(EBlastProgramType program_number, const SBlastFilterOptions *filter_options, Blast_Message **blast_message)
Validates filter options to ensure that program and options are consistent and that options have vali...
BlastContextInfo * contexts
Information per context.
BlastSeqLoc ** seqloc_array
Array of masked locations.
Definition: blast_def.h:231
char * database
Nucleotide database for mini BLAST search.
static Int2 s_GetFilteringLocationsForOneContext(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, Int4 context, EBlastProgramType program_number, const SBlastFilterOptions *filter_options, BlastSeqLoc **filter_out, Blast_Message **blast_message)
Calculates the mask locations one context at a time.
double hicut
Definition: blast_seg.h:52
#define NULL
Definition: ncbistd.hpp:225
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:739
SRepeatFilterOptions * SRepeatFilterOptionsFree(SRepeatFilterOptions *repeat_options)
Frees SRepeatFilterOptions.
Int4 last_context
Index of the last element of the context array.
const Uint1 kProtMask
NCBISTDAA element used to mask residues in BLAST.
Definition: blast_filter.c:43
Uint1 * sequence_start
Start of sequence, usually one byte before sequence as that byte is a NULL sentinel byte...
Definition: blast_def.h:244
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:115
Int2 BLAST_ComplementMaskLocations(EBlastProgramType program_number, const BlastQueryInfo *query_info, const BlastMaskLoc *mask_loc, BlastSeqLoc **complement_mask)
This function takes the list of mask locations (i.e., regions that should not be searched or not adde...
int i
Structure to hold parameters for seg search.
Definition: blast_seg.h:48
SRepeatFilterOptions * repeatFilterOptions
for organism specific repeat filtering.
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
BlastSeqLoc * BlastSeqLocNodeFree(BlastSeqLoc *loc)
Deallocate a single BlastSeqLoc structure and its contents, without following its next pointer...
Definition: blast_filter.c:729
void SegParametersFree(SegParameters *sparamsp)
Free SegParameters structure.
Definition: blast_seg.c:2267
const double kSegHicut
Hicut parameter for SEG.
Definition: blast_seg.c:50
static NCBI_INLINE Boolean BlastIsReverseStrand(Boolean is_na, Int4 context)
Determines whether this is a nucleotide query and whether this is a minus strand or not.
Definition: blast_filter.h:328
unsigned char Uint1
Alias for unsigned char.
Definition: ncbitype.h:117
static Int4 s_BlastSeqLocLen(const BlastSeqLoc *var)
Calculates number of links in a chain of BlastSeqLoc's.
Definition: blast_filter.c:666
Various auxiliary BLAST utility functions.
Options for dust algorithm, applies only to nucl.
const char * database
Use winmasker database at this location.
static Int2 s_ParseWindowMaskerOptions(const char *winmask_options, char **dbname, int *taxid)
Parses window masker options string.
Definition: blast_filter.c:134
SEG filtering functions.
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Uint1 * sequence_start_nomask
Query sequence without masking.
Definition: blast_def.h:255
Int2 SWindowMaskerOptionsNew(SWindowMaskerOptions **winmask_options)
Allocates memory for SWindowMaskerOptions, fills in defaults.
Definition: blast_options.c:94
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:99
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:610
const int kDustLevel
Level parameter used by dust.
Definition: blast_options.c:50
Int2 Blast_MessageWrite(Blast_Message **blast_msg, EBlastSeverity severity, int context, const char *message)
Writes a message to a structure.
#define CODON_LENGTH
Codons are always of length 3.
Definition: blast_def.h:63
const CharType(& source)[N]
Definition: pointer.h:1107
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm. ...
Definition: blast_program.h:70
SSegOptions * segOptions
low-complexity filtering for proteins sequences (includes translated nucleotides).
SegParameters * SegParametersNewAa(void)
Allocates a SegParameters struct for proteins and fills it with default values.
Definition: blast_seg.c:2220
Structure to hold a message from the core of the BLAST engine.
Definition: blast_message.h:70
Int2 SeqBufferSeg(Uint1 *sequence, Int4 length, Int4 offset, SegParameters *sparamsp, BlastSeqLoc **seg_locs)
Runs seg on a protein sequence in ncbistdaa.
Definition: blast_seg.c:2276
BlastSeqLoc * BlastSeqLocAppend(BlastSeqLoc **head, BlastSeqLoc *node)
Appends the BlastSeqLoc to the list of BlastSeqLoc-s pointed to by head.
Definition: blast_filter.c:623
Filtering options for organism-specific filtering with Window Masker.
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
double locut
Definition: blast_seg.h:51
Int2 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, EBlastProgramType program_number, const SBlastFilterOptions *filter_options, BlastMaskLoc **filter_maskloc, Blast_Message **blast_message)
Does preparation for filtering and then calls BlastSetUp_Filter.
SDustOptions * dustOptions
low-complexity filtering for nucleotides.
const CVect3< U > & v2
Definition: globals.hpp:449
Int2 SSegOptionsNew(SSegOptions **seg_options)
Allocates memory for SSegOptions, fills in defaults.
Definition: blast_options.c:81
BlastMaskLoc * BlastMaskLocFree(BlastMaskLoc *mask_loc)
Deallocate memory for a BlastMaskLoc structure as well as the BlastSeqLoc's pointed to...
Definition: blast_filter.c:791
static Int2 s_ParseRepeatOptions(const char *repeat_options, char **dbname)
Parses repeat filtering options string.
Definition: blast_filter.c:108
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:5915
Int4 query_length
Length of this query, strand or frame.
Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo *qinfo, EBlastProgramType program, Int4 query_index)
Obtains the sequence length for a given query in the query, without taking into consideration any app...
no filtering at all.
Boolean nomask_allocated
If false the two above are just pointers to sequence and sequence_start.
Definition: blast_def.h:257
Options for SEG algorithm, applies only to protein-protein comparisons.
static BlastSeqLoc ** s_BlastSeqLocListToArrayOfPointers(const BlastSeqLoc *list, Int4 *count)
Converts a BlastSeqLoc list to an array of pointers, each pointing to an element of the list passed i...
Definition: blast_filter.c:684
Uint1 * sequence_nomask
Start of query sequence without masking.
Definition: blast_def.h:256
Filtering options for organism-specific repeat filtering.
Boolean is_valid
Determine if this context is valid or not.
static char * s_SafeStrCat(char **dest, unsigned int *dest_size, const char *string2append)
Wrapper around strcat to ensure we don't do buffer overflows :)
Definition: blast_filter.c:306
A structure containing two integers, used e.g.
Definition: blast_def.h:155
void BlastSeqLocCombine(BlastSeqLoc **mask_loc, Int4 link_value)
Go through all mask locations in one sequence and combine any that overlap, deallocating the unneeded...
Definition: blast_filter.c:974
void BlastSetUp_MaskQuery(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number)
Masks the sequence given a BlastMaskLoc.
int linker
min distance to link segments.
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:92
Int2 SRepeatFilterOptionsNew(SRepeatFilterOptions **repeat_options)
Allocates memory for SRepeatFilterOptions, fills in defaults.
Structure to hold a sequence.
Definition: blast_def.h:242
void BlastSeqLocReverse(BlastSeqLoc *masks, Int4 query_length)
Converts reverse strand coordinates to forward strand in place.
Int2 BlastFilteringOptionsFromString(EBlastProgramType program_number, const char *instructions, SBlastFilterOptions **filtering_options, Blast_Message **blast_message)
Produces SBlastFilterOptions from a string that has been traditionally supported in blast...
Definition: blast_filter.c:440
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
SWindowMaskerOptions * SWindowMaskerOptionsFree(SWindowMaskerOptions *winmask_options)
Frees SWindowMaskerOptions.
static int s_SeqRangeSortByStartPosition(const void *vp1, const void *vp2)
Used for qsort, compares two SeqLoc's by starting position.
Definition: blast_filter.c:958
Int4 first_context
Index of the first element of the context array.
SSegOptions * SSegOptionsFree(SSegOptions *seg_options)
Frees SSegOptions.
Definition: blast_options.c:74
#define NUM_FRAMES
Number of frames to which we translate in translating searches.
Definition: blast_def.h:88
const int kDustLinker
Parameter used by dust to link together close low-complexity segments.
Definition: blast_options.c:52
char * BlastFilteringOptionsToString(const SBlastFilterOptions *filtering_options)
Convert the filtering options structure to a string.
Definition: blast_filter.c:325
The query related information.
const int kBlastMessageNoContext
Declared in blast_message.h as extern const.
Definition: blast_message.c:41
#define BLASTOPTIONS_BUFFER_SIZE
Allowed length of the filtering options string.
Definition: blast_filter.c:47
#define strdup
Definition: ncbi_ansi_ext.h:52
Int2 SBlastFilterOptionsNew(SBlastFilterOptions **filter_options, EFilterOptions type)
Allocates memory for SBlastFilterOptions and.
Int2 BlastMaskLocProteinToDNA(BlastMaskLoc *mask_loc, const BlastQueryInfo *query_info)
Given a BlastMaskLoc with an array of lists of mask locations per protein frame, recalculates all mas...
Definition: blast_filter.c:894
SDustOptions * SDustOptionsFree(SDustOptions *dust_options)
Frees SDustOptions.
Definition: blast_options.c:54
const int kDustWindow
Window parameter used by dust.
Definition: blast_options.c:51
Int2 BlastSetUp_Filter(EBlastProgramType program_number, Uint1 *sequence, Int4 length, Int4 offset, const SBlastFilterOptions *filter_options, BlastSeqLoc **seqloc_retval, Blast_Message **blast_message)
Runs seg filtering functions, according to the filtering options, returns BlastSeqLoc*.
Int4 total_size
Total size of the BlastSeqLoc array below.
Definition: blast_def.h:218
static char const rcsid[]
Definition: blast_filter.c:32
signed short Int2
Alias for signed short.
Definition: ncbitype.h:118
#define strcasecmp
Definition: ncbi_ansi_ext.h:87
static uschar * buffer
Definition: pcretest.c:187
BlastMaskLoc * BlastMaskLocNew(Int4 total)
Allocate memory for a BlastMaskLoc.
Definition: blast_filter.c:762
Int4 length
Length of sequence.
Definition: blast_def.h:246
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Modified on Sun Jul 05 13:02:40 2015 by modify_doxy.py rev. 426318