NCBI C++ ToolKit
blast_aascan.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_aascan.c 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file blast_aascan.c
28  * Functions for accessing hits in the protein BLAST lookup table.
29  */
30 
33 #include "masksubj.inl"
34 
35 /**
36  * Scans the subject sequence from "offset" to the end of the sequence.
37  * Copies at most array_size hits.
38  * Returns the number of hits found.
39  * If there isn't enough room to copy all the hits, return early, and update
40  * "offset".
41  *
42  * @param lookup_wrap the lookup table [in]
43  * @param subject the subject sequence [in]
44  * @param offset_pairs Array to which hits will be copied [out]
45  * @param array_size length of the offset arrays [in]
46  * @return The number of hits found.
47  */
48 static Int4 s_BlastAaScanSubject(const LookupTableWrap * lookup_wrap,
49  const BLAST_SequenceBlk * subject,
50  BlastOffsetPair * NCBI_RESTRICT offset_pairs,
51  Int4 array_size,
52  Int4 * s_range)
53 {
54  Int4 index;
55  Uint1 *s = NULL;
56  Uint1 *s_first = NULL;
57  Uint1 *s_last = NULL;
58  Int4 numhits = 0; /* number of hits found for a given subject
59  offset */
60  Int4 totalhits = 0; /* cumulative number of hits found */
61  PV_ARRAY_TYPE *pv;
64  Int4 *ovfl;
65  Int4 word_length;
66 
67  ASSERT(lookup_wrap->lut_type == eAaLookupTable);
68  lookup = (BlastAaLookupTable *) lookup_wrap->lut;
69  ASSERT(lookup->bone_type == eBackbone);
70  pv = lookup->pv;
71  bbc = (AaLookupBackboneCell *) lookup->thick_backbone;
72  ovfl = (Int4 *) lookup->overflow;
73  word_length = lookup->word_length;
74 
75  while (s_DetermineScanningOffsets(subject, word_length, word_length, s_range)) {
76  s_first=subject->sequence + s_range[1];
77  s_last=subject->sequence + s_range[2];
78 
79  /* prime the index */
80  index = ComputeTableIndex(word_length - 1,
81  lookup->charsize, s_first);
82 
83  for (s = s_first; s <= s_last; s++) {
84  /* compute the index value */
85  index = ComputeTableIndexIncremental(word_length,
86  lookup->charsize,
87  lookup->mask, s, index);
88 
89  /* if there are hits... */
90  if (PV_TEST(pv, index, PV_ARRAY_BTS)) {
91  numhits = bbc[index].num_used;
92 
93  ASSERT(numhits != 0);
94 
95  /* ...and there is enough space in the destination array, */
96  if (numhits <= (array_size - totalhits))
97  /* ...then copy the hits to the destination */
98  {
99  Int4 *src;
100  if (numhits <= AA_HITS_PER_CELL)
101  /* hits live in thick_backbone */
102  src = bbc[index].payload.entries;
103  else
104  /* hits live in overflow array */
105  src = &(ovfl[bbc[index].payload.overflow_cursor]);
106 
107  /* copy the hits. */
108  {
109  Int4 i;
110  Int4 s_off = s - subject->sequence;
111  for (i = 0; i < numhits; i++) {
112  offset_pairs[i + totalhits].qs_offsets.q_off = src[i];
113  offset_pairs[i + totalhits].qs_offsets.s_off = s_off;
114  }
115  }
116 
117  totalhits += numhits;
118  } else
119  /* not enough space in the destination array; return early */
120  {
121  s_range[1] = s - subject->sequence;
122  return totalhits;
123  }
124  }
125  } /* end for */
126  s_range[1] = s - subject->sequence;
127  } /* end while */
128 
129  /* if we get here, we fell off the end of the sequence */
130  return totalhits;
131 }
132 
133 /** same function for small lookup table */
134 static Int4 s_BlastSmallAaScanSubject(const LookupTableWrap * lookup_wrap,
135  const BLAST_SequenceBlk * subject,
136  BlastOffsetPair * NCBI_RESTRICT offset_pairs,
137  Int4 array_size,
138  Int4 * s_range)
139 {
140  Int4 index;
141  Uint1 *s = NULL;
142  Uint1 *s_first = NULL;
143  Uint1 *s_last = NULL;
144  Int4 numhits = 0; /* number of hits found for a given subject
145  offset */
146  Int4 totalhits = 0; /* cumulative number of hits found */
147  PV_ARRAY_TYPE *pv;
150  Uint2 *ovfl;
151  Int4 word_length;
152 
153  ASSERT(lookup_wrap->lut_type == eAaLookupTable);
154  lookup = (BlastAaLookupTable *) lookup_wrap->lut;
155  ASSERT(lookup->bone_type == eSmallbone);
156  pv = lookup->pv;
157  bbc = (AaLookupSmallboneCell *) lookup->thick_backbone;
158  ovfl = (Uint2 *) lookup->overflow;
159  word_length = lookup->word_length;
160 
161  while (s_DetermineScanningOffsets(subject, word_length, word_length, s_range)) {
162  s_first=subject->sequence + s_range[1];
163  s_last=subject->sequence + s_range[2];
164 
165  /* prime the index */
166  index = ComputeTableIndex(word_length - 1,
167  lookup->charsize, s_first);
168 
169  for (s = s_first; s <= s_last; s++) {
170  /* compute the index value */
171  index = ComputeTableIndexIncremental(word_length,
172  lookup->charsize,
173  lookup->mask, s, index);
174 
175  /* if there are hits... */
176  if (PV_TEST(pv, index, PV_ARRAY_BTS)) {
177  numhits = bbc[index].num_used;
178 
179  ASSERT(numhits != 0);
180 
181  /* ...and there is enough space in the destination array, */
182  if (numhits <= (array_size - totalhits))
183  /* ...then copy the hits to the destination */
184  {
185  Uint2 *src;
186  if (numhits <= AA_HITS_PER_CELL)
187  /* hits live in thick_backbone */
188  src = bbc[index].payload.entries;
189  else
190  /* hits live in overflow array */
191  src = &(ovfl[bbc[index].payload.overflow_cursor]);
192 
193  /* copy the hits. */
194  {
195  Int4 i;
196  Int4 s_off = s - subject->sequence;
197  for (i = 0; i < numhits; i++) {
198  offset_pairs[i + totalhits].qs_offsets.q_off = src[i];
199  offset_pairs[i + totalhits].qs_offsets.s_off = s_off;
200  }
201  }
202 
203  totalhits += numhits;
204  } else
205  /* not enough space in the destination array; return early */
206  {
207  s_range[1] = s - subject->sequence;
208  return totalhits;
209  }
210  }
211  } /* end for */
212  s_range[1] = s - subject->sequence;
213 
214  } /* end while */
215  /* if we get here, we fell off the end of the sequence */
216  return totalhits;
217 }
218 
219 /**
220  * Scans the subject sequence from "offset" to the end of the sequence,
221  * assuming a compressed protein alphabet
222  * Copies at most array_size hits.
223  * Returns the number of hits found.
224  * If there isn't enough room to copy all the hits, return early, and update
225  * "offset".
226  *
227  * @param lookup_wrap the lookup table [in]
228  * @param subject the subject sequence [in]
229  * @param offset the offset in the subject at which to begin scanning [in/out]
230  * @param offset_pairs Array to which hits will be copied [out]
231  * @param array_size length of the offset arrays [in]
232  * @return The number of hits found.
233  */
235  const LookupTableWrap * lookup_wrap,
236  const BLAST_SequenceBlk * subject,
237  BlastOffsetPair * NCBI_RESTRICT offset_pairs,
238  Int4 array_size,
239  Int4 * s_range)
240 {
241  Int4 index=0;
242  Int4 preshift; /* used for 2-stage index calculation */
243  Uint1 *s = NULL;
244  Uint1 *s_first = NULL;
245  Uint1 *s_last = NULL;
246  Int4 numhits = 0; /* number of hits found for one subject offset */
247  Int4 totalhits = 0; /* cumulative number of hits found */
248  PV_ARRAY_TYPE *pv;
249  Int4 pv_array_bts;
251 
252  Int4 word_length;
253  Int4 recip; /* reciprocal of compressed word size */
254  Int4* scaled_compress_table;
255  Int4 skip = 0; /* skip counter - how many letters left to skip*/
256  Uint1 next_char; /* prefetch variable */
257  Int4 compressed_char; /* translated letter */
258  Int4 compressed_alphabet_size;
259 
260  ASSERT(lookup_wrap->lut_type == eCompressedAaLookupTable);
261  lookup = (BlastCompressedAaLookupTable *) lookup_wrap->lut;
262  word_length = lookup->word_length;
263 
264  while (s_DetermineScanningOffsets(subject, word_length, word_length, s_range)) {
265  s_first=subject->sequence + s_range[1];
266  s_last=subject->sequence + s_range[2];
267 
268  compressed_alphabet_size = lookup->compressed_alphabet_size;
269  scaled_compress_table = lookup->scaled_compress_table;
270  recip = lookup->reciprocal_alphabet_size;
271  pv = lookup->pv;
272  pv_array_bts = lookup->pv_array_bts;
273 
274  /* prime the index */
275  for(s = s_first; s <= s_last; s++){
276  index = s_ComputeCompressedIndex(word_length - 1, s,
277  compressed_alphabet_size,
278  &skip, lookup);
279  if(!skip)
280  break;
281  }
282 
283  next_char = ((s <= s_last)? s[word_length-1] : 0);
284  preshift = (Int4)((((Int8)index) * recip) >> 32);
285 
286  /* main scanning loop */
287  for (; s <= s_last; s++) {
288  /* compute the index value */
289 
290  compressed_char = scaled_compress_table[next_char];
291  next_char = s[word_length];
292 
293  if(compressed_char < 0){ /* flush (rare) "bad" character(s) */
294  preshift = 0;
295  s++;
296  for(skip = word_length-1; skip && (s <= s_last) ; s++){
297  compressed_char = scaled_compress_table[next_char];
298  next_char = s[word_length];
299 
300  if(compressed_char < 0){ /* not again! */
301  skip = word_length-1;
302  preshift = 0;
303  continue;
304  }
305 
306  index = preshift + compressed_char;
307  preshift = (Int4)((((Int8)( index )) * recip) >> 32);
308  skip--;
309  }
310 
311  s--; /*undo the following increment*/
312  continue;
313  }
314 
315  /* we have to remove the oldest letter from the
316  index and add in the next letter. The latter is easy,
317  but since the compressed alphabet size is not a
318  power of two the former requires a remainder and
319  multiply, assuming the old letter is in the high part
320  of the index. For this reason, we reverse the order
321  of the letters and keep the oldest in the low part
322  of index, so that a single divide (implemented via
323  reciprocal multiplication) does the removal.
324  Index calculation done in two steps to let the CPU do
325  out-of-order execution. */
326 
327  index = preshift + compressed_char;
328  preshift = (Int4)((((Int8)( index )) * recip) >> 32);
329 
330  /* if there are hits */
331  if (PV_TEST(pv, index, pv_array_bts)) {
332  Int4 s_off = s - subject->sequence;
333 
334  CompressedLookupBackboneCell* backbone_cell =
335  lookup->backbone + index;
336 
337  numhits = backbone_cell->num_used;
338 
339  /* and there is enough space in the destination array */
340  if (numhits <= (array_size - totalhits)) {
341 
342  /* copy the hits to the destination */
343 
344  Int4 i;
345  Int4 *query_offsets;
346  BlastOffsetPair *dest = offset_pairs + totalhits;
347 
348  if (numhits <= COMPRESSED_HITS_PER_BACKBONE_CELL) {
349  /* hits all live in the backbone */
350 
351  query_offsets = backbone_cell->payload.query_offsets;
352  for (i = 0; i < numhits; i++) {
353  dest[i].qs_offsets.q_off = query_offsets[i];
354  dest[i].qs_offsets.s_off = s_off;
355  }
356  }
357  else {
358  /* hits are in the backbone cell and in the overflow list */
359  CompressedOverflowCell* curr_cell =
360  backbone_cell->payload.overflow_list.head;
361  /* we know the overflow list has at least one cell,
362  so it's safe to speculatively fetch the pointer
363  to further cells */
364  CompressedOverflowCell* next_cell = curr_cell->next;
365 
366  /* the number of hits in the linked list of cells has
367  1 added to it; the extra hit was spilled from the
368  backbone when the list was first created */
369  Int4 first_cell_entries = (numhits -
372 
373  /* copy hits from backbone */
374  query_offsets =
375  backbone_cell->payload.overflow_list.query_offsets;
376  for(i = 0; i < COMPRESSED_HITS_PER_BACKBONE_CELL - 1; i++) {
377  dest[i].qs_offsets.q_off = query_offsets[i];
378  dest[i].qs_offsets.s_off = s_off;
379  }
380 
381  /* handle the overflow list */
382 
383  /* first cell can be partially filled */
384  query_offsets = curr_cell->query_offsets;
385  dest += i;
386  for (i = 0; i < first_cell_entries; i++) {
387  dest[i].qs_offsets.q_off = query_offsets[i];
388  dest[i].qs_offsets.s_off = s_off;
389  }
390 
391  /* handle the rest of the list */
392 
393  if (next_cell != NULL) {
394  curr_cell = next_cell;
395  while (curr_cell != NULL) {
396  query_offsets = curr_cell->query_offsets;
397  curr_cell = curr_cell->next; /* prefetch */
398  dest += i;
399  for (i = 0; i < COMPRESSED_HITS_PER_OVERFLOW_CELL; i++) {
400  dest[i].qs_offsets.q_off = query_offsets[i];
401  dest[i].qs_offsets.s_off = s_off;
402  }
403  }
404  }
405  }
406 
407  totalhits += numhits;
408  }
409  else
410  /* not enough space in the destination array */
411  {
412  s_range[1] = s - subject->sequence;
413  return totalhits;
414  }
415  }
416  } /* end for */
417  s_range[1] = s - subject->sequence;
418  } /* end while */
419 
420  /* if we get here, we fell off the end of the sequence */
421  return totalhits;
422 }
423 
424 /** Add one query-subject pair to the list of such pairs retrieved
425  * from the RPS blast lookup table.
426  * @param b the List in which the current pair will be placed [in/out]
427  * @param q_off query offset [in]
428  * @param s_off subject offset [in]
429  */
430 static void s_AddToRPSBucket(RPSBucket * b, Uint4 q_off, Uint4 s_off)
431 {
432  BlastOffsetPair *offset_pairs = b->offset_pairs;
433  Int4 i = b->num_filled;
434  if (i == b->num_alloc) {
435  b->num_alloc *= 2;
436  offset_pairs = b->offset_pairs =
437  (BlastOffsetPair *) realloc(b->offset_pairs,
438  b->num_alloc *
439  sizeof(BlastOffsetPair));
440  }
441  offset_pairs[i].qs_offsets.q_off = q_off;
442  offset_pairs[i].qs_offsets.s_off = s_off;
443  b->num_filled++;
444 }
445 
446 /**
447  * Scans the RPS query sequence from "offset" to the end of the sequence.
448  * Copies at most array_size hits.
449  * Returns the number of hits found.
450  * If there isn't enough room to copy all the hits, return early, and update
451  * "offset".
452  *
453  * @param lookup_wrap the lookup table [in]
454  * @param sequence the subject sequence [in]
455  * @param offset the offset in the subject at which to begin scanning [in/out]
456  * @return The number of hits found.
457  */
459  const BLAST_SequenceBlk * sequence,
460  Int4 * offset)
461 {
462  Int4 index;
463  Int4 table_correction;
464  Uint1 *s = NULL;
465  Uint1 *abs_start = sequence->sequence;
466  Uint1 *s_first = NULL;
467  Uint1 *s_last = NULL;
468  Int4 numhits = 0; /* number of hits found for a given subject
469  offset */
470  Int4 totalhits = 0; /* cumulative number of hits found */
472  RPSBackboneCell *cell;
473  RPSBucket *bucket_array;
474  PV_ARRAY_TYPE *pv;
475  /* Buffer a large number of hits at once. The number of hits is
476  independent of the search, because the structures that will contain
477  them grow dynamically. A large number is needed because cache reuse
478  requires that many hits to the same neighborhood of the concatenated
479  database are available at any given time */
480  const Int4 max_hits = 4000000;
481 
482  ASSERT(lookup_wrap->lut_type == eRPSLookupTable);
483  lookup = (BlastRPSLookupTable *) lookup_wrap->lut;
484  bucket_array = lookup->bucket_array;
485 
486  /* empty the previous collection of hits */
487 
488  for (index = 0; index < lookup->num_buckets; index++)
489  bucket_array[index].num_filled = 0;
490 
491  s_first = abs_start + *offset;
492  s_last = abs_start + sequence->length - lookup->wordsize;
493  pv = lookup->pv;
494 
495  /* Calling code expects the returned sequence offsets to refer to the
496  *first letter* in a word. The legacy RPS blast lookup table stores
497  offsets to the *last* letter in each word, and so a correction is
498  needed */
499 
500  table_correction = lookup->wordsize - 1;
501 
502  /* prime the index */
503  index = ComputeTableIndex(lookup->wordsize - 1,
504  lookup->charsize, s_first);
505 
506  for (s = s_first; s <= s_last; s++) {
507  /* compute the index value */
508  index = ComputeTableIndexIncremental(lookup->wordsize,
509  lookup->charsize,
510  lookup->mask, s, index);
511 
512  /* if there are hits... */
513  if (PV_TEST(pv, index, PV_ARRAY_BTS)) {
514  cell = &lookup->rps_backbone[index];
515  numhits = cell->num_used;
516 
517  ASSERT(numhits != 0);
518 
519  if (numhits <= (max_hits - totalhits)) {
520  Int4 *src;
521  Int4 i;
522  Uint4 q_off;
523  Uint4 s_off = s - abs_start;
524  if (numhits <= RPS_HITS_PER_CELL) {
525  for (i = 0; i < numhits; i++) {
526  q_off = cell->entries[i] - table_correction;
527  s_AddToRPSBucket(bucket_array +
528  q_off / RPS_BUCKET_SIZE, q_off,
529  s_off);
530  }
531  } else {
532  /* hits (past the first) live in overflow array */
533  src =
534  lookup->overflow + (cell->entries[1] / sizeof(Int4));
535  q_off = cell->entries[0] - table_correction;
536  s_AddToRPSBucket(bucket_array + q_off / RPS_BUCKET_SIZE,
537  q_off, s_off);
538  for (i = 0; i < (numhits - 1); i++) {
539  q_off = src[i] - table_correction;
540  s_AddToRPSBucket(bucket_array +
541  q_off / RPS_BUCKET_SIZE, q_off,
542  s_off);
543  }
544  }
545 
546  totalhits += numhits;
547  } else
548  /* not enough space in the destination array; return early */
549  {
550  break;
551  }
552  }
553  }
554 
555  /* if we get here, we fell off the end of the sequence */
556  *offset = s - abs_start;
557 
558  return totalhits;
559 }
560 
562 {
563  if (lookup_wrap->lut_type == eAaLookupTable) {
564  BlastAaLookupTable *lut = (BlastAaLookupTable *)(lookup_wrap->lut);
565  /* normal backbone */
566  if(lut->bone_type == eBackbone)
567  lut->scansub_callback = (void *)s_BlastAaScanSubject;
568  /* small bone*/
569  else
571  }
572  else if (lookup_wrap->lut_type == eCompressedAaLookupTable) {
574  (BlastCompressedAaLookupTable *)(lookup_wrap->lut);
576  }
577 }
static NCBI_INLINE Int4 ComputeTableIndexIncremental(Int4 wordsize, Int4 charsize, Int4 mask, const Uint1 *word, Int4 index)
Given a word, compute its index value, reusing a previously computed index value. ...
Definition: blast_lookup.h:121
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins ...
RPSBackboneCell * rps_backbone
the lookup table used for RPS blast
Uint4 q_off
Query offset.
Definition: blast_def.h:143
static Int4 s_BlastCompressedAaScanSubject(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *subject, BlastOffsetPair *NCBI_RESTRICT offset_pairs, Int4 array_size, Int4 *s_range)
Scans the subject sequence from "offset" to the end of the sequence, assuming a compressed protein al...
Definition: blast_aascan.c:234
structure for hashtable of indexed query offsets
CompressedMixedOffsets overflow_list
storage for remote query offsets
structure used for bucket sorting offsets retrieved from the RPS blast lookup table.
ELookupTableType lut_type
What kind of a lookup table it is?
Definition: lookup_wrap.h:51
Int4 num_used
number of hits stored for this cell
RPSBucket * bucket_array
list of buckets
The basic lookup table structure for RPS blast searches.
int offset
signed int Int4
Alias for signed int.
Definition: ncbitype.h:120
unsigned int Uint4
Alias for unsigned int.
Definition: ncbitype.h:121
#define PV_ARRAY_TYPE
The pv_array 'native' type.
Definition: blast_lookup.h:41
#define RPS_BUCKET_SIZE
The number of regions into which the concatenated RPS blast database is split via bucket sorting...
static Int4 s_BlastSmallAaScanSubject(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *subject, BlastOffsetPair *NCBI_RESTRICT offset_pairs, Int4 array_size, Int4 *s_range)
same function for small lookup table
Definition: blast_aascan.c:134
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 charsize
number of bits for a base/residue
CompressedLookupBackboneCell * backbone
hashtable for storing indexed query offsets
structure defining one cell of the RPS lookup table
void * lut
Pointer to the actual lookup table structure.
Definition: lookup_wrap.h:52
static NCBI_INLINE Int4 ComputeTableIndex(Int4 wordsize, Int4 charsize, const Uint1 *word)
Given a word, compute its index value from scratch.
Definition: blast_lookup.h:96
standard protein (blastp) lookup table
Int4 word_length
Length in letters of the full word match required to trigger extension.
Uint2 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
for(len=0;yy_str[len];++len)
structure defining one cell of the small (i.e., use short) lookup table
Int4 pv_array_bts
bit-to-shift value for PV array indices
#define ASSERT
macro for assert.
Definition: ncbi_std.h:105
signed NCBI_INT8_TYPE Int8
Signed 8 byte sized integer.
Definition: ncbitype.h:143
Int4 mask
part of index to mask off, that is, top (wordsize*charsize) bits should be discarded.
#define NULL
Definition: ncbistd.hpp:225
static NCBI_INLINE Int4 s_ComputeCompressedIndex(Int4 wordsize, const Uint1 *word, Int4 compressed_alphabet_size, Int4 *skip, BlastCompressedAaLookupTable *lookup)
Convert a word to use a compressed alphabet.
Int4 num_filled
number of offset pairs currently in bucket
#define PV_TEST(lookup, index, shift)
Test the bit at position 'index' in the PV array bitfield within 'lookup'.
Definition: blast_lookup.h:55
CompressedOverflowCell * head
head of linked list of cells of query offsets stored off the backbone
union AaLookupSmallboneCell::@4 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 query_offsets[3]
storage for query offsets local to the backbone cell
int i
void BlastChooseProteinScanSubject(LookupTableWrap *lookup_wrap)
Choose the most appropriate function to scan through protein subject sequences.
Definition: blast_aascan.c:561
PV_ARRAY_TYPE * pv
Presence vector bitfield; bit positions that are set indicate that the corresponding thick backbone c...
static NCBI_INLINE Boolean s_DetermineScanningOffsets(const BLAST_SequenceBlk *subject, Int4 word_length, Int4 lut_word_length, Int4 *range)
Determines the scanner's offsets taking the database masking restrictions into account (if any)...
Definition: masksubj.inl:43
Int4 word_length
Length in letters of the full word match required to trigger extension.
Int4 num_used
number of hits stored for this cell
unsigned char Uint1
Alias for unsigned char.
Definition: ncbitype.h:117
Int4 wordsize
number of full bytes in a full word
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
Definition: blast_lookup.h:43
void * scansub_callback
function for scanning subject sequences
static void s_AddToRPSBucket(RPSBucket *b, Uint4 q_off, Uint4 s_off)
Add one query-subject pair to the list of such pairs retrieved from the RPS blast lookup table...
Definition: blast_aascan.c:430
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
Definition: blast_def.h:141
Int4 * scaled_compress_table
scaled version of compress_table
Int4 mask
part of index to mask off, that is, top (wordsize*charsize) bits should be discarded.
Int4 num_used
number of hits in this cell
Int4 * overflow
the overflow array for the compacted lookup table
void * scansub_callback
function for scanning subject sequences
Int4 BlastRPSScanSubject(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *sequence, Int4 *offset)
Scans the RPS query sequence from "offset" to the end of the sequence.
Definition: blast_aascan.c:458
Int4 num_buckets
number of buckets used to sort offsets retrieved from the lookup table
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins ...
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
union CompressedLookupBackboneCell::@5 payload
structure for holding the list of query offsets
Int4 compressed_alphabet_size
letters in the compressed alphabet
struct CompressedOverflowCell * next
pointer to next cell
Int4 query_offsets[4]
the query offsets stored in the cell
#define NCBI_RESTRICT
For some reason, ICC claims a suitable __STDC_VERSION__ but then barfs on restrict.
Definition: ncbi_std.h:63
structure defining one cell of the compacted lookup table
Int4 num_alloc
max number of offset pairs bucket can hold
Uint4 s_off
Subject offset.
Definition: blast_def.h:144
Int4 reciprocal_alphabet_size
2^32 / compressed_alphabet_size
static string subject
Int4 charsize
number of bits for a base/residue
The basic lookup table structure for blastp searches.
static Int4 s_BlastAaScanSubject(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *subject, BlastOffsetPair *NCBI_RESTRICT offset_pairs, Int4 array_size, Int4 *s_range)
Scans the subject sequence from "offset" to the end of the sequence.
Definition: blast_aascan.c:48
while(yy_chk[yy_base[yy_current_state]+yy_c]!=yy_current_state)
Routines for creating protein BLAST lookup tables.
Wrapper structure for different types of BLAST lookup tables.
Definition: lookup_wrap.h:50
cell in list for holding query offsets
struct BlastOffsetPair::@6 qs_offsets
Query/subject offset pair.
Routines for creating protein BLAST lookup tables.
PV_ARRAY_TYPE * pv
Presence vector bitfield; bit positions that are set indicate that the corresponding thick backbone c...
Int4 entries[3]
if the number of hits in this cell is RPS_HITS_PER_CELL or less, all hits go into this array...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
PV_ARRAY_TYPE * pv
Presence vector bitfield; bit positions that are set indicate that the corresponding thick backbone c...
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
#define COMPRESSED_HITS_PER_OVERFLOW_CELL
number of query offsets to store in an overflow cell
Structure to hold a sequence.
Definition: blast_def.h:242
compressed alphabet (blastp) lookup table
#define AA_HITS_PER_CELL
maximum number of hits in one lookup table cell
unsigned short Uint2
Alias for unsigned short.
Definition: ncbitype.h:119
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
RPS lookup table (rpsblast and rpstblastn)
Int4 query_offsets[3-1]
the query offsets stored locally
EBoneType bone_type
type of bone used: 0: normal backbone (using Int4) 1: small backbone (using Uint2) will be determined...
#define COMPRESSED_HITS_PER_BACKBONE_CELL
number of query offsets to store in a backbone cell
Int4 length
Length of sequence.
Definition: blast_def.h:246
The lookup table structure for protein searches using a compressed alphabet.
Uint2 num_used
number of hits stored for this cell
BlastOffsetPair * offset_pairs
list of offset pairs
Modified on Thu Oct 18 11:30:11 2018 by modify_doxy.py rev. 546573