NCBI C++ ToolKit
blast_hits.h
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /* $Id: blast_hits.h 62393 2014-04-04 14:15:53Z fongah2 $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Ilya Dondoshansky
00027  *
00028  */
00029 
00030 /** @file blast_hits.h
00031  * Structures and API used for saving BLAST hits
00032  */
00033 
00034 #ifndef ALGO_BLAST_CORE__BLAST_HITS__H
00035 #define ALGO_BLAST_CORE__BLAST_HITS__H
00036 
00037 #include <algo/blast/core/ncbi_std.h>
00038 #include <algo/blast/core/blast_export.h>
00039 #include <algo/blast/core/blast_program.h>
00040 #include <algo/blast/core/blast_query_info.h>
00041 #include <algo/blast/core/blast_options.h>
00042 #include <algo/blast/core/blast_parameters.h>
00043 #include <algo/blast/core/blast_stat.h>
00044 #include <algo/blast/core/gapinfo.h>
00045 #include <algo/blast/core/blast_seqsrc.h>
00046 #include <algo/blast/core/pattern.h>
00047 
00048 #ifdef __cplusplus
00049 extern "C" {
00050 #endif
00051 
00052 /** Keeps prelim_hitlist_size and hsp_num_max (the two hit-saving limits)
00053     together, mostly for use by hspstream. */
00054 typedef struct SBlastHitsParameters {
00055    Int4 prelim_hitlist_size; /**< Number of hits saved during the preliminary
00056                            part of the search. */
00057    Int4 hsp_num_max; /**< Number of HSPs to save per database sequence. */
00058 } SBlastHitsParameters; 
00059 
00060 /** Sets up small structures used by blast_hit.c for saving HSPs.
00061  * @param hit_options Fields hitlist_size and hsp_num_max are needed here.
00062  *      NOTE(review): SBlastHitsParameters has no member that keeps this pointer. [in]
00063  * @param ext_options Field compositionBasedStats is needed here. [in]
00064  * @param scoring_options Field gapped_calculation is needed here. [in]
00065  * @param retval Receives the allocated SBlastHitsParameters* [out]
00066  * @return zero on success, 1 on NULL parameter, 2 if calloc fails.
00067  */
00068 NCBI_XBLAST_EXPORT
00069 Int2 SBlastHitsParametersNew(const BlastHitSavingOptions* hit_options,
00070                              const BlastExtensionOptions* ext_options,
00071                              const BlastScoringOptions* scoring_options,
00072                              SBlastHitsParameters* *retval);
00073 
00074 /** Make a deep copy of the SBlastHitsParameters structure passed in.
00075  * @param hit_params Source hit parameters structure (not modified) [in]
00076  * @return NULL if out of memory, otherwise a deep copy of the first argument
00077  */
00078 NCBI_XBLAST_EXPORT
00079 SBlastHitsParameters* 
00080 SBlastHitsParametersDup(const SBlastHitsParameters* hit_params);
00081 
00082 /** Deallocates an SBlastHitsParameters structure.
00083  * @param param Object to be freed. [in]
00084  * @return NULL pointer.
00085  */
00086 NCBI_XBLAST_EXPORT
00087 SBlastHitsParameters* SBlastHitsParametersFree(SBlastHitsParameters* param);
00088                    
00089 
00090 
00091 
00092 /** One sequence segment within an HSP (BlastHSP keeps one for the query and one for the subject) */
00093 typedef struct BlastSeg {
00094    Int2 frame;  /**< Translation frame */
00095    Int4 offset; /**< Start of the HSP on this sequence */
00096    Int4 end;    /**< End of the HSP on this sequence */
00097    Int4 gapped_start;/**< Where the gapped extension started. */
00098 } BlastSeg;
00099 
00100 /** In PHI BLAST: information about the pattern match in a given HSP. */
00101 typedef struct SPHIHspInfo {
00102     Int4 index; /**< Index of the query pattern occurrence for this HSP. */ 
00103     Int4 length; /**< Length of this pattern occurrence in the subject. */
00104 } SPHIHspInfo;
00105 
00106 /** Structure holding all information about an HSP */
00107 typedef struct BlastHSP {
00108    Int4 score;           /**< This HSP's raw score */
00109    Int4 num_ident;       /**< Number of identities in this HSP (set by Blast_HSPGetNumIdentities) */
00110    double bit_score;     /**< Bit score, calculated from score */
00111    double evalue;        /**< This HSP's e-value */
00112    BlastSeg query;       /**< Query sequence info. */
00113    BlastSeg subject;     /**< Subject sequence info. */
00114    Int4     context;     /**< Context number of query */
00115    GapEditScript* gap_info;/**< ALL gapped alignment information is here */
00116    Int4 num;             /**< How many HSPs are linked together for sum
00117                               statistics evaluation? If unset (0), this HSP is
00118                               not part of a linked set, i.e. value 0 is treated
00119                               the same way as 1. */
00120    Int2     comp_adjustment_method;  /**< Which mode of composition
00121                                               adjustment was used; relevant
00122                                               only for blastp and tblastn */
00123    SPHIHspInfo* pat_info; /**< In PHI BLAST, information about this pattern
00124                                  match. */
00125    Int4 num_positives; /**< Number of positives in this HSP (set by Blast_HSPGetNumIdentitiesAndPositives) */
00126 } BlastHSP;
00127 
00128 /** The structure to hold all HSPs for a given sequence after the gapped 
00129  *  alignment.
00130  */
00131 typedef struct BlastHSPList {
00132    Int4 oid;/**< The ordinal id of the subject sequence this HSP list is for */
00133    Int4 query_index; /**< Index of the query which this HSPList corresponds to.
00134                         Set to 0 if not applicable */
00135    BlastHSP** hsp_array; /**< Array of pointers to individual HSPs */
00136    Int4 hspcnt; /**< Number of HSPs saved (filled entries of hsp_array) */
00137    Int4 allocated; /**< The allocated size of the hsp_array */
00138    Int4 hsp_max; /**< The maximal number of HSPs allowed to be saved */
00139    Boolean do_not_reallocate; /**< Is reallocation of the hsp_array disallowed? */
00140    double best_evalue; /**< Smallest e-value for HSPs in this list. Filled after 
00141                           e-values are calculated. Necessary because HSPs are
00142                           sorted by score, but the highest scoring HSP may not have
00143                           the lowest e-value if sum statistics is used. */
00144 } BlastHSPList;
00145 
00146 /** The structure to contain all BLAST results for one query sequence */
00147 typedef struct BlastHitList {
00148    Int4 hsplist_count; /**< Filled size of the HSP lists array */
00149    Int4 hsplist_max; /**< Maximal allowed size of the HSP lists array */
00150    double worst_evalue; /**< Highest of the best e-values among the HSP 
00151                            lists */
00152    Int4 low_score; /**< The lowest of the best scores among the HSP lists */
00153    Boolean heapified; /**< Is this hit list already heapified? */
00154    BlastHSPList** hsplist_array; /**< Array of HSP lists for individual
00155                                           database hits */
00156    Int4 hsplist_current; /**< Number of allocated HSP list arrays
00157                               (NOTE(review): relation to hsplist_max is not visible here). */
00157 } BlastHitList;
00158 
00159 /** The structure to contain all BLAST results, for multiple queries */
00160 typedef struct BlastHSPResults {
00161    Int4 num_queries; /**< Number of query sequences */
00162    BlastHitList** hitlist_array; /**< Array of results (one BlastHitList pointer
00163                                           each) for individual query sequences */
00164 } BlastHSPResults;
00165 
00166 
00167 /** By how much the chunks of a subject sequence should overlap if the
00168     sequence is too long and has to be split. */
00169 #define DBSEQ_CHUNK_OVERLAP 100
00170 
00171 /********************************************************************************
00172 
00173 The following section has four sets of functions (or "APIs") that manipulate
00174 the following structures:
00175 1. BlastHSP, which is the basic unit to record one alignment.  
00176 2. BlastHSPList, which is a list of BlastHSP's for one database sequence. 
00177 3. BlastHitList, which contains all HSPList's for a given query. 
00178 4. BlastHSPResults, which is a list of BlastHitList's for multiple queries.
00179 
00180  The naming conventions for the functions are the following:
00181 
00182 1.) All routines start with "Blast_"
00183 
00184 2.) After "Blast_" comes the structure being manipulated, that should be either 
00185     HSP (all capitals all the time!), HSPList (exactly this capitalization), 
00186     HitList (capital H and L, all others lower-case), or HSPResults.
00187 
00188 3.) Finally comes the task being done, e.g., "Free", "New", "Init".
00189 
00190 ********************************************************************************/
00191 /********************************************************************************
00192           HSP API
00193 ********************************************************************************/
00194 
00195 /** Deallocate memory for an HSP structure. @param hsp The HSP to be freed [in] */
00196 NCBI_XBLAST_EXPORT
00197 BlastHSP* Blast_HSPFree(BlastHSP* hsp);
00198 
00199 /** Allocates and zeros out memory for an HSP structure. */
00200 NCBI_XBLAST_EXPORT
00201 BlastHSP* Blast_HSPNew(void);
00202 
00203 /** Allocates a BlastHSP and initializes it with the information passed in
00204  * through the arguments.
00205  * @param query_start Start of query alignment [in]
00206  * @param query_end End of query alignment [in]
00207  * @param subject_start Start of subject alignment [in]
00208  * @param subject_end End of subject alignment [in]
00209  * @param query_gapped_start Where gapped alignment started on query [in]
00210  * @param subject_gapped_start Where gapped alignment started on subject [in]
00211  * @param query_context The index of the query containing this HSP [in]
00212  * @param query_frame Query frame: -3..3 for translated sequence, 
00213  *        1 or -1 for blastn, 0 for blastp [in]
00214  * @param subject_frame Subject frame: -3..3 for translated sequence, 
00215  *        1 for blastn, 0 for blastp [in]
00216  * @param score Score of alignment [in]
00217  * @param gap_edit Will be transferred to the HSP and nulled out;
00218  *    may be NULL if a traceback was not calculated [in] [out]
00219  * @param ret_hsp Allocated and filled in BlastHSP [out]
00220  */
00221 NCBI_XBLAST_EXPORT
00222 Int2
00223 Blast_HSPInit(Int4 query_start, Int4 query_end, 
00224               Int4 subject_start, Int4 subject_end, 
00225               Int4 query_gapped_start, Int4 subject_gapped_start, 
00226               Int4 query_context, Int2 query_frame, Int2 subject_frame,
00227               Int4 score, GapEditScript* *gap_edit, BlastHSP** ret_hsp);
00228 
00229 /** Reevaluate the HSP's score and percent identity after taking
00230  * into account the ambiguity information. Used only for blastn after a greedy
00231  * gapped extension with traceback. This function can remove part of the 
00232  * alignment at either end, if its score becomes negative after reevaluation.
00233  * The traceback is also adjusted in that case.
00234  * @param hsp The HSP structure to reevaluate and possibly trim [in] [out]
00235  * @param query_start Pointer to the start of the query sequence [in]
00236  * @param query_length Length of the query sequence [in]
00237  * @param subject_start Pointer to the start of the subject sequence [in]
00238  * @param subject_length Length of the subject sequence [in]
00239  * @param hit_params Hit saving parameters containing score cut-off [in]
00240  * @param score_params Scoring parameters [in]
00241  * @param sbp Score block with Karlin-Altschul parameters [in]
00242  * @return Should this HSP be deleted after the score reevaluation?
00243  */
00244 NCBI_XBLAST_EXPORT
00245 Boolean 
00246 Blast_HSPReevaluateWithAmbiguitiesGapped(BlastHSP* hsp, 
00247    const Uint1* query_start, const Int4 query_length, 
00248    const Uint1* subject_start, const Int4 subject_length,
00249    const BlastHitSavingParameters* hit_params, 
00250    const BlastScoringParameters* score_params, const BlastScoreBlk* sbp);
00251 
00252 /** Reevaluate the HSP's score and percent identity after taking into
00253  * account the ambiguity information. Used for ungapped searches with a
00254  * nucleotide database (blastn, tblastn, tblastx).
00255  * @param hsp The HSP structure to reevaluate [in] [out]
00256  * @param query_start Pointer to the start of the query sequence [in]
00257  * @param subject_start Pointer to the start of the subject sequence [in]
00258  * @param word_params Initial word parameters with ungapped cutoff score [in]
00259  * @param sbp Score block with Karlin-Altschul parameters [in]
00260  * @param translated Are sequences protein (with a translated subject)? [in]
00261  * @return Should this HSP be deleted after the score reevaluation?
00262  */
00263 NCBI_XBLAST_EXPORT
00264 Boolean 
00265 Blast_HSPReevaluateWithAmbiguitiesUngapped(BlastHSP* hsp, 
00266    const Uint1* query_start, const Uint1* subject_start,
00267    const BlastInitialWordParameters* word_params, 
00268    BlastScoreBlk* sbp, Boolean translated);
00269 
00270 /** Calculate the number of identities in an HSP and set the
00271  * BlastHSP::num_ident field (unconditionally).
00272  * @param query The query sequence [in]
00273  * @param subject The uncompressed subject sequence [in]
00274  * @param hsp All information about the HSP; the output of this function is
00275  * stored in its num_ident field [in|out]
00276  * @param score_options Scoring options [in]
00277  * @param align_length_ptr The alignment length, including gaps (optional) [out]
00278  * @return 0 on success, -1 on invalid parameters or error
00279  */
00280 NCBI_XBLAST_EXPORT
00281 Int2
00282 Blast_HSPGetNumIdentities(const Uint1* query, 
00283                           const Uint1* subject, 
00284                           BlastHSP* hsp, 
00285                           const BlastScoringOptions* score_options,
00286                           Int4* align_length_ptr);
00287 
00288 /** Calculate the number of identities and positives in an HSP and set the
00289  *  BlastHSP::num_ident and BlastHSP::num_positives fields.
00290  * @param query The query sequence [in]
00291  * @param subject The uncompressed subject sequence [in]
00292  * @param hsp All information about the HSP; the output of this function is
00293  * stored in its num_ident and num_positives fields [in|out]
00294  * @param score_options Scoring options [in]
00295  * @param align_length_ptr The alignment length, including gaps (optional) [out]
00296  * @param sbp Score blk containing the matrix for counting positives [in]
00297  * @return 0 on success, -1 on invalid parameters or error
00298  */
00299 NCBI_XBLAST_EXPORT
00300 Int2
00301 Blast_HSPGetNumIdentitiesAndPositives(const Uint1* query,
00302                                       const Uint1* subject,
00303                                       BlastHSP* hsp,
00304                                       const BlastScoringOptions* score_options,
00305                                       Int4* align_length_ptr,
00306                                       const BlastScoreBlk* sbp);
00307 
00308 /** Determines whether this HSP should be kept or
00309  * deleted.
00310  * @param hsp An HSP structure [in] [out]
00311  * @param hit_options Hit saving options containing percent identity and
00312  *                    HSP length thresholds. [in]
00313  * @param align_length Alignment length including gaps [in]
00314  * @return FALSE if HSP passes the test, TRUE if it should be deleted.
00315  */
00316 NCBI_XBLAST_EXPORT
00317 Boolean
00318 Blast_HSPTest(BlastHSP* hsp,
00319               const BlastHitSavingOptions* hit_options,
00320               Int4 align_length);
00321 
00322 /** Calculates the number of identities and the alignment length of an HSP via
00323  * Blast_HSPGetNumIdentities and determines whether this HSP should be kept or
00324  * deleted. 
00325  * @param program_number Type of BLAST program [in]
00326  * @param hsp An HSP structure [in] [out]
00327  * @param query Query sequence [in]
00328  * @param subject Subject sequence [in]
00329  * @param score_options Scoring options, needed to distinguish the 
00330  *                      out-of-frame case. [in]
00331  * @param hit_options Hit saving options containing percent identity and
00332  *                    HSP length thresholds. [in]
00333  * @return FALSE if HSP passes the test, TRUE if it should be deleted.
00334  */ 
00335 NCBI_XBLAST_EXPORT
00336 Boolean
00337 Blast_HSPTestIdentityAndLength(EBlastProgramType program_number, 
00338                                BlastHSP* hsp, const Uint1* query, const Uint1* subject, 
00339                                const BlastScoringOptions* score_options,
00340                                const BlastHitSavingOptions* hit_options);
00341 
00342 /** Calculate the query coverage percentage of an HSP.
00343  *  @param hsp An HSP structure (not modified) [in]
00344  *  @param query_length Length of query [in]
00345  *  @return Percentage query coverage of the input HSP
00346  */
00347 NCBI_XBLAST_EXPORT
00348 double
00349 Blast_HSPGetQueryCoverage(const BlastHSP* hsp, Int4 query_length);
00350 
00351 /** Test whether an HSP's query coverage percentage is below a minimum threshold.
00352  *  @param hsp An HSP structure [in]
00353  *  @param min_query_coverage_pct Min query coverage pct for saving the hsp [in]
00354  *  @param query_length Length of query [in]
00355  *  @return true if hsp's query coverage pct < min_query_coverage_pct (delete hsp)
00356  */
00357 NCBI_XBLAST_EXPORT
00358 Boolean Blast_HSPQueryCoverageTest(BlastHSP* hsp,
00359                                    double min_query_coverage_pct,
00360                                    Int4 query_length);
00361 
00362 /** Calculates the number of HSPs that should be saved.
00363  * @param gapped_calculation Ungapped if false [in]
00364  * @param options BlastHitSavingOptions object [in]
00365  * @return Number of HSPs to save. 
00366  */
00367 NCBI_XBLAST_EXPORT
00368 Int4
00369 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions* options);
00370 
00371 /** Calculate the length of an HSP as the length in query plus the length of
00372  * gaps in query. If gap information is unavailable, return the maximum of the
00373  * lengths in query and in subject.
00374  * @param hsp An HSP structure (not modified) [in]
00375  * @param length Length of this HSP [out]
00376  * @param gaps Total number of gaps in this HSP [out]
00377  * @param gap_opens Number of gap openings in this HSP [out] 
00378  */
00379 NCBI_XBLAST_EXPORT
00380 void Blast_HSPCalcLengthAndGaps(const BlastHSP* hsp, Int4* length,
00381                                 Int4* gaps, Int4* gap_opens);
00382 
00383 /** Adjust HSP endpoint offsets according to strand/frame; the values are
00384  * returned in 1-offset coordinates instead of the internal 0-offset ones.
00385  * @param program Type of BLAST program [in]
00386  * @param hsp An HSP structure [in]
00387  * @param query_length Length of query [in]
00388  * @param subject_length Length of subject [in]
00389  * @param q_start Start of alignment in query [out]
00390  * @param q_end End of alignment in query [out]
00391  * @param s_start Start of alignment in subject [out]
00392  * @param s_end End of alignment in subject [out]
00393  */
00394 NCBI_XBLAST_EXPORT
00395 void 
00396 Blast_HSPGetAdjustedOffsets(EBlastProgramType program, BlastHSP* hsp, 
00397                             Int4 query_length, Int4 subject_length, 
00398                             Int4* q_start, Int4* q_end,
00399                             Int4* s_start, Int4* s_end);
00400 
00401 /** Performs the translation and coordinates adjustment, if only part of the 
00402  * subject sequence is translated for gapped alignment. 
00403  * @param subject_blk Subject sequence structure [in]
00404  * @param hsp The HSP information [in] [out]
00405  * @param is_ooframe Return a mixed-frame sequence if TRUE [in]
00406  * @param gen_code_string Database genetic code [in]
00407  * @param translation_buffer_ptr Pointer to buffer holding the translation [out]
00408  * @param subject_ptr Pointer to sequence to be passed to the gapped 
00409  *                    alignment [out]
00410  * @param subject_length_ptr Length of the translated sequence [out]
00411  * @param start_shift_ptr How far the partial sequence is shifted w.r.t. the
00412  *                        full sequence. [out]
00413  */
00414 NCBI_XBLAST_EXPORT
00415 Int2
00416 Blast_HSPGetPartialSubjectTranslation(BLAST_SequenceBlk* subject_blk, 
00417    BlastHSP* hsp, Boolean is_ooframe, const Uint1* gen_code_string, 
00418    Uint1** translation_buffer_ptr, Uint1** subject_ptr, 
00419    Int4* subject_length_ptr, Int4* start_shift_ptr);
00420 
00421 /** Adjusts offsets if a partial sequence was used for extension.
00422  * @param hsp The hit to work on [in][out]
00423  * @param start_shift Amount of database sequence not used for extension. [in]
00424 */
00425 NCBI_XBLAST_EXPORT
00426 void
00427 Blast_HSPAdjustSubjectOffset(BlastHSP* hsp, Int4 start_shift);
00428 
00429 
00430 /** Returns a buffer with a protein translated from nucleotide.
00431  * @param target_t SBlastTargetTranslation* with information about translation [in]
00432  * @param hsp The hit to work on [in]
00433  * @param translated_length Length of the protein sequence (NOTE(review): non-const pointer, presumably [out] - confirm) [in]
00434 */
00435 NCBI_XBLAST_EXPORT
00436 const Uint1*
00437 Blast_HSPGetTargetTranslation(SBlastTargetTranslation* target_t, const BlastHSP* hsp, Int4* translated_length);
00438 
00439 /********************************************************************************
00440           HSPList API
00441 ********************************************************************************/
00442 
00443 /** Deallocate memory for an HSP list structure 
00444  *  as well as all its components.
00445  * @param hsp_list The BlastHSPList to be freed [in]. 
00446 */
00447 NCBI_XBLAST_EXPORT
00448 BlastHSPList* Blast_HSPListFree(BlastHSPList* hsp_list);
00449 
00450 /** Creates an HSP list structure with a default size HSP array.
00451  * @param hsp_max The maximum number of HSPs that can ever be
00452  *    saved at once [in].
00453 */
00454 NCBI_XBLAST_EXPORT
00455 BlastHSPList* Blast_HSPListNew(Int4 hsp_max);
00456 
00457 /** Returns true if the BlastHSPList contains no HSPs.
00458  * @param hsp_list List of HSPs to examine (not modified) [in]
00459  */
00460 NCBI_XBLAST_EXPORT
00461 Boolean
00462 Blast_HSPList_IsEmpty(const BlastHSPList* hsp_list);
00463 
00464 /** Returns a duplicate (deep copy) of the given hsp list. @param hsp_list List to copy (not modified) [in] */
00465 NCBI_XBLAST_EXPORT
00466 BlastHSPList* BlastHSPListDup(const BlastHSPList* hsp_list);
00467 
00468 /** Swaps the two HSP lists via structure assignment. @param list1 First list [in] [out] @param list2 Second list [in] [out] */
00469 NCBI_XBLAST_EXPORT
00470 void Blast_HSPListSwap(BlastHSPList* list1, BlastHSPList* list2);
00471 
00472 /** Saves HSP information into a BlastHSPList structure.
00473  * @param hsp_list Structure holding all HSPs with full gapped alignment 
00474  *        information [in] [out]
00475  * @param hsp The new HSP to be inserted into the HSPList [in]
00476 */
00477 NCBI_XBLAST_EXPORT
00478 Int2
00479 Blast_HSPListSaveHSP(BlastHSPList* hsp_list, BlastHSP* hsp);
00480 
00481 /** Calculate the expected values for all HSPs in a hit list, without using 
00482  * the sum statistics. In case of multiple queries, the offsets are assumed 
00483  * to be already adjusted to individual query coordinates, and the contexts 
00484  * are set for each HSP.
00485  * @param program_number Type of BLAST program [in]
00486  * @param query_info Auxiliary query information - needed only for effective
00487  *                   search space calculation if it is not provided [in]
00488  * @param subject_length Subject length - needed for Spouge's new FSC [in]
00489  * @param hsp_list List of HSPs for one subject sequence [in] [out]
00490  * @param gapped_calculation Is this for a gapped or ungapped search? [in]
00491  * @param RPS_prelim Is this for an RPS preliminary search? [in]
00492  * @param sbp Structure containing statistical information [in]
00493  * @param gap_decay_rate Adjustment parameter to compensate for the effects of
00494  * performing multiple tests when linking HSPs. No adjustment is made if 0. [in]
00495  * @param scaling_factor Scaling factor by which Lambda should be divided. Used in
00496  *                       RPS BLAST only; should be set to 1.0 in other cases. [in]
00497  *                       
00498  */
00499 NCBI_XBLAST_EXPORT
00500 Int2 Blast_HSPListGetEvalues(EBlastProgramType program_number,
00501                              const BlastQueryInfo* query_info,
00502                              Int4 subject_length,
00503                              BlastHSPList* hsp_list,
00504                              Boolean gapped_calculation, 
00505                              Boolean RPS_prelim,
00506                              const BlastScoreBlk* sbp, double gap_decay_rate,
00507                              double scaling_factor);
00508 
00509 /** Calculate e-values for a PHI BLAST HSP list.
00510  * @param hsp_list HSP list found by PHI BLAST [in] [out]
00511  * @param sbp Scoring block with statistical parameters [in]
00512  * @param query_info Structure containing information about pattern counts [in]
00513  * @param pattern_blk Structure containing information about pattern hits in the db [in]
00514  */
00515 NCBI_XBLAST_EXPORT
00516 void Blast_HSPListPHIGetEvalues(BlastHSPList* hsp_list, BlastScoreBlk* sbp, 
00517                                 const BlastQueryInfo* query_info,
00518                                 const SPHIPatternSearchBlk* pattern_blk);
00519 
00520 /** Calculate bit scores from the raw scores in an HSP list.
00521  * @param hsp_list List of HSPs [in] [out]
00522  * @param gapped_calculation Is this a gapped search? [in]
00523  * @param sbp Scoring block with statistical parameters [in]
00524  */
00525 NCBI_XBLAST_EXPORT
00526 Int2 Blast_HSPListGetBitScores(BlastHSPList* hsp_list, 
00527                                Boolean gapped_calculation, 
00528                                const BlastScoreBlk* sbp);
00529 
00530 /** Calculate bit scores from the raw scores in an HSP list for a PHI BLAST search.
00531  * @param hsp_list List of HSPs [in] [out]
00532  * @param sbp Scoring block with statistical parameters [in]
00533  */
00534 NCBI_XBLAST_EXPORT
00535 void Blast_HSPListPHIGetBitScores(BlastHSPList* hsp_list, BlastScoreBlk* sbp);
00536     
00537 /** Discard the HSPs above the e-value threshold from the HSP list.
00538  * @param hsp_list List of HSPs for one subject sequence [in] [out]
00539  * @param hit_options Options block containing the e-value cut-off [in]
00540 */
00541 NCBI_XBLAST_EXPORT
00542 Int2 Blast_HSPListReapByEvalue(BlastHSPList* hsp_list, 
00543                                const BlastHitSavingOptions* hit_options);
00544 
00545 /** Discard the HSPs above the raw threshold from the HSP list 
00546  * @param hsp_list List of HSPs for one subject sequence [in] [out]
00547  * @param hit_options Options block, presumably holding the raw score threshold (was documented as "e-value cut-off" - confirm) [in]
00548  * -RMH-
00549  */
00550 NCBI_XBLAST_EXPORT
00551 Int2 Blast_HSPListReapByRawScore(BlastHSPList* hsp_list,
00552                                const BlastHitSavingOptions* hit_options);
00553 
00554 /** Discard the HSPs below the min query coverage pct from the HSP list.
00555  * @param hsp_list List of HSPs for one subject sequence [in] [out]
00556  * @param hit_options Options block containing the min query coverage pct [in]
00557  * @param query_info Structure containing information about the queries [in]
00558  * @param program_number Type of BLAST program. [in]
00559 */
00560 NCBI_XBLAST_EXPORT
00561 Int2 Blast_HSPListReapByQueryCoverage(BlastHSPList* hsp_list,
00562                                       const BlastHitSavingOptions* hit_options,
00563                                       const BlastQueryInfo* query_info,
00564                                       EBlastProgramType program_number);
00565 
00566 /** Cleans out the NULLed out HSPs from the HSP array that
00567  * is part of the BlastHSPList.
00568  * @param hsp_list Contains array of pointers to HSP structures [in] [out]
00569  * @return Status of function call.
00570 */
00571 NCBI_XBLAST_EXPORT
00572 Int2
00573 Blast_HSPListPurgeNullHSPs(BlastHSPList* hsp_list);
00574 
00575 /** Check for an overlap of two different alignments and remove redundant HSPs.
00576  * A sufficient overlap is when two alignments have the same start or end values.
00577  * If an overlap is found, the HSP with the lowest score is removed; if both scores
00578  * are the same, then the first is removed.
00579  * @param program Type of BLAST program. For some programs (PHI BLAST), the
00580  *                purge should not be performed. [in]
00581  * @param hsp_list Contains array of pointers to HSPs to purge [in] [out]
00582  * @param purge Should the hsp be purged? [in]
00583  * @return The number of valid alignments remaining. 
00584 */
00585 NCBI_XBLAST_EXPORT
00586 Int4
00587 Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program, 
00588                                           BlastHSPList* hsp_list,
00589                                           Boolean purge);
00590 
00591 /** Reevaluate all ungapped HSPs in an HSP list.  
00592  * This is only done for an ungapped search, or if traceback is 
00593  * already available.
00594  * The subject sequence is uncompressed and saved here (for nucleotide sequences).
00595  * The number of identities is calculated for each HSP along the way,
00596  * hence this function is called for all programs. 
00597  * @param program Type of BLAST program [in]
00598  * @param hsp_list The list of HSPs for one subject sequence [in] [out]
00599  * @param query_blk The query sequence [in]
00600  * @param subject_blk The subject sequence [in] [out]
00601  * @param word_params Initial word parameters, containing ungapped cutoff 
00602  *                    score [in]
00603  * @param hit_params Hit saving parameters, including cutoff score [in]
00604  * @param query_info Auxiliary query information [in]
00605  * @param sbp The statistical information [in]
00606  * @param score_params Parameters related to scoring [in]
00607  * @param seq_src The BLAST database structure (for retrieving the uncompressed
00608  *             sequence) [in]
00609  * @param gen_code_string Genetic code string in case of a translated 
00610  *                        database search. [in]
00611  */
00612 NCBI_XBLAST_EXPORT
00613 Int2 
00614 Blast_HSPListReevaluateUngapped(EBlastProgramType program, 
00615    BlastHSPList* hsp_list, BLAST_SequenceBlk* query_blk, 
00616    BLAST_SequenceBlk* subject_blk, 
00617    const BlastInitialWordParameters* word_params,
00618    const BlastHitSavingParameters* hit_params, const BlastQueryInfo* query_info, 
00619    BlastScoreBlk* sbp, const BlastScoringParameters* score_params, 
00620    const BlastSeqSrc* seq_src, const Uint1* gen_code_string);
00621 
00622 /** Append one HSP list to the other. Discard lower scoring HSPs if there is
00623  * not enough space to keep all.
00624  * @param old_hsp_list_ptr List of HSPs, will be NULLed out on return [in|out]
00625  * @param combined_hsp_list_ptr Pointer to the combined list of HSPs, possibly
00626  *                              containing previously saved HSPs [in] [out]
00627  * @param hsp_num_max Maximal allowed number of HSPs to save (unlimited if INT4_MAX) [in]
00628  * @return Status: 0 on success, -1 on failure.
00629  */ 
00630 NCBI_XBLAST_EXPORT
00631 Int2 Blast_HSPListAppend(BlastHSPList** old_hsp_list_ptr,
00632         BlastHSPList** combined_hsp_list_ptr, Int4 hsp_num_max);
00633 
00634 /** Merge an HSP list from a chunk of the subject sequence into a previously
00635  * computed HSP list.
00636  * @param hsp_list Contains HSPs from the new chunk [in]
00637  * @param combined_hsp_list_ptr Contains HSPs from previous chunks [in] [out]
00638  * @param hsp_num_max Maximal allowed number of HSPs to save (unlimited if INT4_MAX) [in]
00639  * @param split_points The sequence offset (query or subject) that is
00640  *             the boundary between HSPs in combined_hsp_list and hsp_list. [in]
00641  * @param contexts_per_query If positive, the number of query contexts
00642  *                    that hits can contain. If negative, the (one) split
00643  *                    point occurs on the subject sequence [in]
00644  * @param chunk_overlap_size The length of the overlap region between the
00645  *                    sequence region containing hsp_list and that
00646  *                    containing combined_hsp_list [in]
00647  * @param allow_gap Allow merging HSPs at different diagonals [in]
00648  * @return 0 if HSP lists have been merged successfully, -1 otherwise.
00649  */
00650 NCBI_XBLAST_EXPORT
00651 Int2 Blast_HSPListsMerge(BlastHSPList** hsp_list, 
00652                    BlastHSPList** combined_hsp_list_ptr, 
00653                    Int4 hsp_num_max, Int4* split_points, 
00654                    Int4 contexts_per_query,
00655                    Int4 chunk_overlap_size,
00656                    Boolean allow_gap);
00657                    
00658 /** Adjust subject offsets in an HSP list if only part of the subject sequence
00659  * was searched. Used when a long subject sequence is split into more manageable
00660  * chunks.
00661  * @param hsp_list List of HSPs from a chunk of a subject sequence [in] [out]
00662  * @param offset Offset where the chunk starts [in]
00663  */
00664 NCBI_XBLAST_EXPORT
00665 void Blast_HSPListAdjustOffsets(BlastHSPList* hsp_list, Int4 offset);
00666 
00667 /** For nucleotide BLAST, if the match reward score is equal to 2,
00668  * random alignments are dominated by runs of exact matches, which all have even
00669  * scores. This makes it impossible to estimate statistical parameters correctly
00670  * for odd scores. Hence the raw score formula is adjusted - all scores are
00671  * rounded down to the nearest even value in order to provide a conservative estimate.
00672  * @param hsp_list HSP list structure to adjust scores for. [in] [out]
00673  * @param gapped_calculation not an ungapped alignment [in]
00674  * @param sbp Scoring block, used for its round_down Boolean [in]
00675  */
00676 NCBI_XBLAST_EXPORT
00677 void Blast_HSPListAdjustOddBlastnScores(BlastHSPList* hsp_list, 
00678                                         Boolean gapped_calculation, 
00679                                         const BlastScoreBlk* sbp);
00680 
00681 /** Check whether an HSP list is sorted by score; the list is not modified.
00682  * @param hsp_list The list to check [in]
00683  * @return TRUE if sorted, FALSE if not.
00684  */
00685 NCBI_XBLAST_EXPORT
00686 Boolean Blast_HSPListIsSortedByScore(const BlastHSPList* hsp_list);
00687 
00688 /** Sort the HSPs in an HSP list by score. This type of sorting is done before
00689  * the e-values are calculated, and also at the beginning of the traceback stage,
00690  * where it is needed to eliminate the effects of wrong score order because of
00691  * application of sum statistics.
00692  * Checks if the HSP array is already sorted before proceeding with quicksort.
00693  * @param hsp_list Structure containing array of HSPs to be sorted. [in] [out]
00694  */
00695 NCBI_XBLAST_EXPORT
00696 void Blast_HSPListSortByScore(BlastHSPList* hsp_list);
00697 
00698 /** Sort the HSPs in an HSP list by e-value, with scores and other criteria
00699  * used to resolve ties. Checks whether the HSP array is already sorted before
00700  * proceeding with quicksort.
00701  * @param hsp_list Structure containing array of HSPs to be sorted. [in] [out]
00702  */
00703 NCBI_XBLAST_EXPORT
00704 void Blast_HSPListSortByEvalue(BlastHSPList* hsp_list);
00705 
00706 /********************************************************************************
00707           HitList API.
00708 ********************************************************************************/
00709 
00710 /** Allocate memory for a hit list of a given size.
00711  * @param hitlist_size Size of the hit list (number of HSP lists) [in]
       * @return Pointer to the allocated hit list. NOTE(review): behavior on
       *         allocation failure is not stated here - confirm against the
       *         implementation.
00712  */
00713 NCBI_XBLAST_EXPORT
00714 BlastHitList* Blast_HitListNew(Int4 hitlist_size);
00715 
00716 /** Deallocate memory for the hit list.
       * @param hitlist Hit list to deallocate [in]
       * @return BlastHitList pointer. NOTE(review): presumably NULL once the
       *         list has been freed - confirm against the implementation.
       */
00717 NCBI_XBLAST_EXPORT
00718 BlastHitList* Blast_HitListFree(BlastHitList* hitlist);
00719 
00720 /** Deallocate memory for every HSP list on BlastHitList,
00721  *  as well as all their components.
00722  * @param hitlist contains the BlastHSPList array to be freed [in] [out]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
00723 */
00724 NCBI_XBLAST_EXPORT
00725 Int2 Blast_HitListHSPListsFree(BlastHitList* hitlist);
00726 
00727 /** Insert a new HSP list into the hit list.
00728  * Before capacity of the hit list is reached, just add to the end;
00729  * After that, store in a heap, to ensure efficient insertion and deletion.
00730  * The heap order is reverse, with worst e-value on top, for convenience
00731  * of deletion.
00732  * @param hit_list Contains all HSP lists saved so far [in] [out]
00733  * @param hsp_list A new HSP list to be inserted into the hit list [in]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
00734 */
00735 NCBI_XBLAST_EXPORT
00736 Int2 Blast_HitListUpdate(BlastHitList* hit_list, BlastHSPList* hsp_list);
00737 
00738 /** Combine two hitlists; both HitLists must contain HSPs that
00739  * represent alignments to the same query sequence.
00740  * @param old_hit_list_ptr Pointer to original HitList, will be NULLed
00741  *                          out on return [in] [out]
00742  * @param combined_hit_list_ptr Pointer to the combined HitList [in] [out]
00743  * @param contexts_per_query The number of different contexts that can
00744  *             occur in hits from old_hit_list and combined_hit_list [in]
00745  * @param split_offsets The query offsets that mark the boundary between
00746  *             combined_hit_list and old_hit_list. HSPs in old_hit_list
00747  *             that hit to context i are assumed to lie to the right
00748  *             of split_offsets[i] [in]
00749  * @param chunk_overlap_size The length of the overlap region between the
00750  *                    sequence region containing old_hit_list and that
00751  *                    containing combined_hit_list [in]
00752  * @param allow_gap Allow merging HSPs at different diagonals [in]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
00753 */
00754 NCBI_XBLAST_EXPORT
00755 Int2 Blast_HitListMerge(BlastHitList** old_hit_list_ptr,
00756                         BlastHitList** combined_hit_list_ptr,
00757                         Int4 contexts_per_query, Int4 *split_offsets,
00758                         Int4 chunk_overlap_size, Boolean allow_gap);
00759 
00760 /** Purges a BlastHitList of NULL HSP lists.
00761  * @param hit_list BLAST hit list to purge. [in] [out]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
00762  */
00763 NCBI_XBLAST_EXPORT
00764 Int2 
00765 Blast_HitListPurgeNullHSPLists(BlastHitList* hit_list);
00766 /********************************************************************************
00767           HSPResults API.
00768 ********************************************************************************/
00769 
00770 /** Initialize the results structure.
00771  * @param num_queries Number of query sequences to allocate results structure
00772  *                    for [in]
       * @return Pointer to the allocated results structure. NOTE(review):
       *         behavior on allocation failure is not stated here - confirm
       *         against the implementation.
00773  */
00774 NCBI_XBLAST_EXPORT
00775 BlastHSPResults* Blast_HSPResultsNew(Int4 num_queries);
00776 
00777 /** Deallocate memory for BLAST results.
       * @param results Results structure to deallocate [in]
       * @return BlastHSPResults pointer. NOTE(review): presumably NULL once
       *         the results have been freed - confirm against the
       *         implementation.
       */
00778 NCBI_XBLAST_EXPORT
00779 BlastHSPResults* Blast_HSPResultsFree(BlastHSPResults* results);
00780 
00781 /** Sort each hit list in the BLAST results by best e-value.
       * @param results BLAST results whose hit lists are sorted [in] [out]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
       */
00782 NCBI_XBLAST_EXPORT
00783 Int2 Blast_HSPResultsSortByEvalue(BlastHSPResults* results);
00784 /** Sort each hit list in the BLAST results by best e-value, in reverse
00785  * order.
       * @param results BLAST results whose hit lists are sorted [in] [out]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
       */
00786 NCBI_XBLAST_EXPORT
00787 Int2 Blast_HSPResultsReverseSort(BlastHSPResults* results);
00788 
00789 /** Reverse order of HSP lists in each hit list in the BLAST results.
00790  * This allows HSP lists to be returned from the end of the arrays when
00791  * reading from a collector HSP stream.
       * @param results BLAST results whose hit lists are reversed [in] [out]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
00792  */
00793 NCBI_XBLAST_EXPORT
00794 Int2 Blast_HSPResultsReverseOrder(BlastHSPResults* results);
00795 
00796 /** Blast_HSPResultsInsertHSPList
00797  * Insert an HSP list to the appropriate place in the results structure.
00798  * All HSPs in this list must be from the same query and same subject; the oid
00799  * and query_index fields must be set in the BlastHSPList input structure.
00800  * @param results The structure holding results for all queries [in] [out]
00801  * @param hsp_list The results for one query-subject sequence pair. [in]
00802  * @param hitlist_size Maximal allowed hit list size. [in]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
00803  */
00804 NCBI_XBLAST_EXPORT
00805 Int2 Blast_HSPResultsInsertHSPList(BlastHSPResults* results, 
00806         BlastHSPList* hsp_list, Int4 hitlist_size);
00807 
00808 /* Forward declaration: the prototypes below only take pointers to struct BlastHSPStream, so the full definition is not needed here. */
00809 struct BlastHSPStream;
00810 
00811 /** Move all of the hits within an HSPStream into a BlastHSPResults
00812  * structure.
00813  * @param hsp_stream The HSPStream [in] [out]
00814  * @param num_queries Number of queries in the search [in]
00815  * @param hit_param Hit parameters [in]
00816  * @return The generated collection of HSP results
       * NOTE(review): num_queries is declared as size_t here but as Uint4 in
       * Blast_HSPResultsFromHSPStreamWithLimit and its Ex variant - confirm
       * whether the difference is intentional.
00817  */
00818 NCBI_XBLAST_EXPORT
00819 BlastHSPResults*
00820 Blast_HSPResultsFromHSPStream(struct BlastHSPStream* hsp_stream, 
00821                               size_t num_queries, 
00822                               SBlastHitsParameters* hit_param);
00823 
00824 /** As Blast_HSPResultsFromHSPStream, except the total number of
00825  * HSPs kept for each query does not exceed an explicit limit.
00826  * The database sequences with the smallest number of hits are
00827  * saved first, and hits are removed from query i if the average
00828  * number of hits saved threatens to exceed (max_num_hsps / (number
00829  * of DB sequences with hits to query i))
00830  * @param hsp_stream The HSPStream [in] [out]
00831  * @param num_queries Number of queries in the search [in]
00832  * @param hit_param Hit parameters [in]
00833  * @param max_num_hsps The limit on the number of HSPs to be
00834  *                     kept for each query sequence [in]
00835  * @param removed_hsps Set to TRUE if any hits were removed [out]
00836  * @return The generated collection of HSP results
       * NOTE(review): unlike Blast_HSPResultsFromHSPStream, this declaration
       * carries no NCBI_XBLAST_EXPORT - confirm that this is intentional.
00837  */
00838 BlastHSPResults*
00839 Blast_HSPResultsFromHSPStreamWithLimit(struct BlastHSPStream* hsp_stream, 
00840                                    Uint4 num_queries, 
00841                                    SBlastHitsParameters* hit_param,
00842                                    Uint4 max_num_hsps,
00843                                    Boolean* removed_hsps);
00844 
00845 BlastHSPResults*
00846 /** As Blast_HSPResultsFromHSPStreamWithLimit, except that it accepts and
00847  * returns an array of Boolean flags specifying which query exceeded HSP limits.
       * NOTE(review): this comment sits between the return type and the
       * function name; consider moving it above the whole declaration so that
       * it reads (and is picked up by Doxygen) like the other entries.
00848  */
00849 Blast_HSPResultsFromHSPStreamWithLimitEx(struct BlastHSPStream* hsp_stream, 
00850                                    Uint4 num_queries, 
00851                                    SBlastHitsParameters* hit_param,
00852                                    Uint4 max_num_hsps,
00853                                    Boolean* removed_hsps);
00854 /** Splits the BlastHSPResults structure for a PHI BLAST search into an array of
00855  * BlastHSPResults structures, corresponding to different pattern occurrences in
00856  * the query. All HSPs are copied, so it is safe to free the returned
00857  * BlastHSPResults structures independently of the input results structure.
00858  * @param results All results from a PHI BLAST search, with HSPs for
00859  *                different query pattern occurrences mixed together. [in]
00860  * @param pattern_info Information about pattern occurrences in query. [in]
00861  * @return Array of pointers to BlastHSPResults structures, corresponding to
00862  *         different pattern occurrences.
00863  */
00864 NCBI_XBLAST_EXPORT
00865 BlastHSPResults** 
00866 PHIBlast_HSPResultsSplit(const BlastHSPResults* results, 
00867                          const SPHIQueryInfo* pattern_info);
00868 
00869 
00870 /** Count the number of occurrences of pattern in sequence, which
00871  * do not overlap by more than half the pattern match length.
00872  * @param query_info Query information structure, containing pattern info. [in]
       * @return The number of such pattern occurrences.
00873  */
00874 NCBI_XBLAST_EXPORT
00875 Int4
00876 PhiBlastGetEffectiveNumberOfPatterns(const BlastQueryInfo *query_info);
00877 
00878 /** Apply Cross_match like masklevel to HSP list.  -RMH-
       * @param results BLAST results the masklevel is applied to.
       *                NOTE(review): presumably modified in place (non-const
       *                pointer) - confirm against the implementation.
       * @param query_info Auxiliary query information [in]
       * @param masklevel Masklevel value to apply [in]
       * @param query_length NOTE(review): presumably the length of the query
       *                     sequence - confirm against the implementation [in]
       * @return Status code. NOTE(review): the meaning of the values is not
       *         stated in this header - confirm against the implementation.
       * NOTE(review): this declaration carries no NCBI_XBLAST_EXPORT, unlike
       * most declarations in this header - confirm that this is intentional.
00879  */
00880 Int2 Blast_HSPResultsApplyMasklevel(BlastHSPResults *results,
00881                                     const BlastQueryInfo *query_info,
00882                                     Int4 masklevel, Int4 query_length);
00883 
00884 #ifdef __cplusplus
00885 }
00886 #endif
00887 #endif /* !ALGO_BLAST_CORE__BLAST_HITS__H */
Modified on Thu Jul 24 19:36:04 2014 by modify_doxy.py rev. 426318