src/algo/blast/core/blast_setup.c

Go to the documentation of this file.
00001 /* $Id: blast_setup.c 134303 2008-07-17 17:42:49Z camacho $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author: Tom Madden
00027  *
00028  */
00029 
00030 /** @file blast_setup.c
00031  * Utilities initialize/setup BLAST.
00032  */
00033 
00034 
00035 #ifndef SKIP_DOXYGEN_PROCESSING
00036 static char const rcsid[] =
00037     "$Id: blast_setup.c 134303 2008-07-17 17:42:49Z camacho $";
00038 #endif /* SKIP_DOXYGEN_PROCESSING */
00039 
00040 #include <algo/blast/core/blast_setup.h>
00041 #include <algo/blast/core/blast_util.h>
00042 #include <algo/blast/core/blast_filter.h>
00043 
00044 /* See description in blast_setup.h */
00045 Int2
00046 Blast_ScoreBlkKbpGappedCalc(BlastScoreBlk * sbp,
00047                             const BlastScoringOptions * scoring_options,
00048                             EBlastProgramType program, 
00049                             const BlastQueryInfo * query_info,
00050                             Blast_Message** error_return)
00051 {
00052     Int4 index = 0;
00053 
00054     if (sbp == NULL || scoring_options == NULL) {
00055         Blast_PerrorWithLocation(error_return, BLASTERR_INVALIDPARAM, -1);
00056         return 1;
00057     }
00058 
00059     /* Allocate and fill values for a gapped Karlin block, given the scoring
00060        options, then copy it for all the query contexts, as long as they're
00061        contexts that will be searched (i.e.: valid) */
00062     for (index = query_info->first_context;
00063          index <= query_info->last_context; index++) {
00064         Int2 retval = 0;
00065 
00066         if ( !query_info->contexts[index].is_valid ) {
00067             continue;
00068         }
00069 
00070         sbp->kbp_gap_std[index] = Blast_KarlinBlkNew();
00071 
00072         /* At this stage query sequences are nucleotide only for blastn */
00073         if (program == eBlastTypeBlastn) {
00074             retval = 
00075                 Blast_KarlinBlkNuclGappedCalc(sbp->kbp_gap_std[index],
00076                     scoring_options->gap_open, scoring_options->gap_extend, 
00077                     scoring_options->reward, scoring_options->penalty, 
00078                     sbp->kbp_std[index], &(sbp->round_down), error_return);
00079         } else {
00080             retval = 
00081                 Blast_KarlinBlkGappedCalc(sbp->kbp_gap_std[index],
00082                     scoring_options->gap_open, scoring_options->gap_extend,
00083                     sbp->name, error_return);
00084         }
00085         if (retval) {
00086             return retval;
00087         }
00088 
00089         /* For right now, copy the contents from kbp_gap_std to 
00090          * kbp_gap_psi (as in old code - BLASTSetUpSearchInternalByLoc) */
00091         if (program != eBlastTypeBlastn) {
00092             sbp->kbp_gap_psi[index] = Blast_KarlinBlkNew();
00093             Blast_KarlinBlkCopy(sbp->kbp_gap_psi[index], 
00094                                 sbp->kbp_gap_std[index]);
00095         }
00096     }
00097 
00098     /* Set gapped Blast_KarlinBlk* alias */
00099     sbp->kbp_gap = Blast_QueryIsPssm(program) ? 
00100         sbp->kbp_gap_psi : sbp->kbp_gap_std;
00101 
00102     return 0;
00103 }
00104 
00105 /** Fills a scoring block structure for a PHI BLAST search. 
00106  * @param sbp Scoring block structure [in] [out]
00107  * @param options Scoring options structure [in]
00108  * @param blast_message Structure for reporting errors [out]
00109  * @param get_path callback function for matrix path [in]
00110  */
00111 static Int2
00112 s_PHIScoreBlkFill(BlastScoreBlk* sbp, const BlastScoringOptions* options,
00113    Blast_Message** blast_message, GET_MATRIX_PATH get_path)
00114 {
00115    Blast_KarlinBlk* kbp;
00116    char buffer[1024];
00117    Int2 status = 0;
00118 
00119    sbp->read_in_matrix = TRUE;
00120    if ((status = Blast_ScoreBlkMatrixFill(sbp, get_path)) != 0)
00121       return status;
00122    kbp = sbp->kbp_gap_std[0] = Blast_KarlinBlkNew();
00123    /* Point both non-allocated Karlin block arrays to kbp_gap_std. */
00124    sbp->kbp_gap = sbp->kbp_gap_std;
00125 
00126    /* For PHI BLAST, the H value is not used, but it is not allowed to be 0, 
00127       so set it to 1. */
00128    kbp->H = 1.0;
00129 
00130    /* This is populated so that the checks for valid contexts don't fail,
00131     * note that this field is not used at all during a PHI-BLAST search */
00132    sbp->sfp[0] = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore);
00133 
00134    /* Ideal Karlin block is filled unconditionally. */
00135    status = Blast_ScoreBlkKbpIdealCalc(sbp);
00136    if (status)
00137       return status;
00138 
00139    if (0 == strcmp("BLOSUM62", options->matrix)) {
00140       kbp->paramC = 0.50;
00141       if ((11 == options->gap_open) && (1 == options->gap_extend)) {
00142          kbp->Lambda = 0.270;
00143          kbp->K = 0.047;
00144       } else if ((9 == options->gap_open) && (2 == options->gap_extend)) {
00145          kbp->Lambda = 0.285;
00146          kbp->K = 0.075;
00147       } else if ((8 == options->gap_open) && (2 == options->gap_extend)) {
00148          kbp->Lambda = 0.265;
00149          kbp->K = 0.046;
00150       } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
00151          kbp->Lambda = 0.243;
00152          kbp->K = 0.032;
00153       } else if ((12 == options->gap_open) && (1 == options->gap_extend)) {
00154          kbp->Lambda = 0.281;
00155          kbp->K = 0.057;
00156       } else if ((10 == options->gap_open) && (1 == options->gap_extend)) {
00157          kbp->Lambda = 0.250;
00158          kbp->K = 0.033;
00159       } else {
00160           status = -1;
00161       }
00162    } else if (0 == strcmp("PAM30", options->matrix)) { 
00163        kbp->paramC = 0.30;
00164        if ((9 == options->gap_open) && (1 == options->gap_extend)) {
00165            kbp->Lambda = 0.295;
00166            kbp->K = 0.13;
00167        } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
00168            kbp->Lambda = 0.306;
00169            kbp->K = 0.15;
00170            return status;
00171        } else if ((6 == options->gap_open) && (2 == options->gap_extend)) {
00172            kbp->Lambda = 0.292;
00173            kbp->K = 0.13;
00174        } else if ((5 == options->gap_open) && (2 == options->gap_extend)) {
00175            kbp->Lambda = 0.263;
00176            kbp->K = 0.077;
00177        } else if ((10 == options->gap_open) && (1 == options->gap_extend)) {
00178            kbp->Lambda = 0.309;
00179            kbp->K = 0.15;
00180        } else if ((8 == options->gap_open) && (1 == options->gap_extend)) {
00181            kbp->Lambda = 0.270;
00182            kbp->K = 0.070;
00183            return status;
00184        } else {
00185            status = -1;
00186        }
00187    } else if (0 == strcmp("PAM70", options->matrix)) { 
00188        kbp->paramC = 0.35;
00189        if ((10 == options->gap_open) && (1 == options->gap_extend)) {
00190            kbp->Lambda = 0.291;
00191            kbp->K = 0.089;
00192        } else if ((8 == options->gap_open) && (2 == options->gap_extend)) {
00193            kbp->Lambda = 0.303;
00194            kbp->K = 0.13;
00195        } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
00196            kbp->Lambda = 0.287;
00197            kbp->K = 0.095;
00198        } else if ((6 == options->gap_open) && (2 == options->gap_extend)) {
00199            kbp->Lambda = 0.269;
00200            kbp->K = 0.079;
00201        } else if ((11 == options->gap_open) && (1 == options->gap_extend)) {
00202            kbp->Lambda = 0.307;
00203            kbp->K = 0.13;
00204        } else if ((9 == options->gap_open) && (1 == options->gap_extend)) {
00205            kbp->Lambda = 0.269;
00206            kbp->K = 0.058;
00207        } else {
00208            status = -1;
00209        }
00210    } else if (0 == strcmp("BLOSUM80", options->matrix)) { 
00211        kbp->paramC = 0.40;
00212        if ((10 == options->gap_open) && (1 == options->gap_extend)) {
00213            kbp->Lambda = 0.300;
00214            kbp->K = 0.072;
00215        } else if ((8 == options->gap_open) && (2 == options->gap_extend)) {
00216            kbp->Lambda = 0.308;
00217            kbp->K = 0.089;
00218        } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
00219            kbp->Lambda = 0.295;
00220            kbp->K = 0.077;
00221        } else if ((6 == options->gap_open) && (2 == options->gap_extend)) {
00222            kbp->Lambda = 0.271;
00223            kbp->K = 0.051;
00224        } else if ((11 == options->gap_open) && (1 == options->gap_extend)) {
00225            kbp->Lambda = 0.314;
00226            kbp->K = 0.096;
00227            return status;
00228        } else if ((9 == options->gap_open) && (1 == options->gap_extend)) {
00229            kbp->Lambda = 0.277;
00230            kbp->K = 0.046;
00231        } else {
00232            status = -1;
00233        }
00234    } else if (0 == strcmp("BLOSUM45", options->matrix)) { 
00235        kbp->paramC = 0.60;
00236        if ((14 == options->gap_open) && (2 == options->gap_extend)) {
00237            kbp->Lambda = 0.199;
00238            kbp->K = 0.040;
00239        } else if ((13 == options->gap_open) && (3 == options->gap_extend)) {
00240            kbp->Lambda = 0.209;
00241            kbp->K = 0.057;
00242        } else if ((12 == options->gap_open) && (3 == options->gap_extend)) {
00243            kbp->Lambda = 0.203;
00244             kbp->K = 0.049;
00245        } else if ((11 == options->gap_open) && (3 == options->gap_extend)) {
00246            kbp->Lambda = 0.193;
00247            kbp->K = 0.037;
00248        } else if ((10 == options->gap_open) && (3 == options->gap_extend)) {
00249            kbp->Lambda = 0.182;
00250            kbp->K = 0.029;
00251        } else if ((15 == options->gap_open) && (2 == options->gap_extend)) {
00252            kbp->Lambda = 0.206;
00253            kbp->K = 0.049;
00254        } else if ((13 == options->gap_open) && (2 == options->gap_extend)) {
00255            kbp->Lambda = 0.190;
00256            kbp->K = 0.032;
00257        } else if ((12 == options->gap_open) && (2 == options->gap_extend)) {
00258            kbp->Lambda = 0.177;
00259            kbp->K = 0.023;
00260        } else if ((19 == options->gap_open) && (1 == options->gap_extend)) {
00261            kbp->Lambda = 0.209;
00262            kbp->K = 0.049;
00263        } else if ((18 == options->gap_open) && (1 == options->gap_extend)) {
00264            kbp->Lambda = 0.202;
00265            kbp->K = 0.041;
00266        } else if ((17 == options->gap_open) && (1 == options->gap_extend)) {
00267            kbp->Lambda = 0.195;
00268            kbp->K = 0.034;
00269        } else if ((16 == options->gap_open) && (1 == options->gap_extend)) {
00270            kbp->Lambda = 0.183;
00271            kbp->K = 0.024;
00272        } else {
00273            status = -1;
00274        }
00275    } else {
00276        status = -2;
00277    }
00278 
00279    if (status == -1) {
00280        sprintf(buffer, "The combination %d for gap opening cost and %d for "
00281                "gap extension is not supported in PHI-BLAST with matrix %s\n",
00282                options->gap_open, options->gap_extend, options->matrix);
00283    } else if (status == -2) {
00284        sprintf(buffer, "Matrix %s not allowed in PHI-BLAST\n", options->matrix);
00285    }
00286    if (status) 
00287        Blast_MessageWrite(blast_message, eBlastSevWarning, kBlastMessageNoContext, buffer);
00288    else {
00289        /* Put a copy the Karlin block into the kbp_std array */
00290        sbp->kbp_std[0] = (Blast_KarlinBlk*) 
00291            BlastMemDup(sbp->kbp_gap_std[0], sizeof(Blast_KarlinBlk));
00292        sbp->kbp = sbp->kbp_std;
00293    }
00294 
00295    return status;
00296 }
00297 
00298 Int2
00299 Blast_ScoreBlkMatrixInit(EBlastProgramType program_number, 
00300                   const BlastScoringOptions* scoring_options,
00301                   BlastScoreBlk* sbp,
00302                   GET_MATRIX_PATH get_path)
00303 {
00304     Int2 status = 0;
00305 
00306     if ( !sbp || !scoring_options ) {
00307         return 1;
00308     }
00309 
00310     if (program_number == eBlastTypeBlastn) {
00311 
00312         BLAST_ScoreSetAmbigRes(sbp, 'N');
00313         sbp->penalty = scoring_options->penalty;
00314         sbp->reward = scoring_options->reward;
00315         if (scoring_options->matrix && *scoring_options->matrix != NULLB) {
00316  
00317             sbp->read_in_matrix = TRUE;
00318             sbp->name = strdup(scoring_options->matrix);
00319  
00320         } else {
00321             char buffer[50];
00322             sbp->read_in_matrix = FALSE;
00323             sprintf(buffer, "blastn matrix:%ld %ld",
00324                     (long) sbp->reward, (long) sbp->penalty);
00325             sbp->name = strdup(buffer);
00326         }
00327  
00328      } else {
00329         sbp->read_in_matrix = TRUE;
00330         BLAST_ScoreSetAmbigRes(sbp, 'X');
00331         sbp->name = BLAST_StrToUpper(scoring_options->matrix);
00332     }
00333     status = Blast_ScoreBlkMatrixFill(sbp, get_path);
00334     if (status) {
00335         return status;
00336     }
00337 
00338     return status;
00339 }
00340 
00341 Int2 
00342 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk* query_blk, 
00343                         const BlastQueryInfo* query_info, 
00344                         const BlastScoringOptions* scoring_options, 
00345                         EBlastProgramType program_number, 
00346                         BlastScoreBlk* *sbpp, 
00347                         double scale_factor, 
00348                         Blast_Message* *blast_message,
00349                         GET_MATRIX_PATH get_path)
00350 {
00351     BlastScoreBlk* sbp;
00352     Int2 status=0;      /* return value. */
00353     ASSERT(blast_message);
00354 
00355     if (sbpp == NULL)
00356        return 1;
00357 
00358     if (program_number == eBlastTypeBlastn)
00359        sbp = BlastScoreBlkNew(BLASTNA_SEQ_CODE, query_info->last_context + 1);
00360     else
00361        sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, query_info->last_context + 1);
00362 
00363     if (!sbp) {
00364        Blast_PerrorWithLocation(blast_message, BLASTERR_MEMORY, -1);
00365        return 1;
00366     }
00367 
00368     *sbpp = sbp;
00369     sbp->scale_factor = scale_factor;
00370 
00371     status = Blast_ScoreBlkMatrixInit(program_number, scoring_options, sbp, get_path);
00372     if (status) {
00373         Blast_Perror(blast_message, status, -1);
00374         return status;
00375     }
00376 
00377     /* Fills in block for gapped blast. */
00378     if (Blast_ProgramIsPhiBlast(program_number)) {
00379        status = s_PHIScoreBlkFill(sbp, scoring_options, blast_message, get_path);
00380     } else {
00381        status = Blast_ScoreBlkKbpUngappedCalc(program_number, sbp, query_blk->sequence, 
00382                query_info, blast_message);
00383 
00384        if (scoring_options->gapped_calculation) {
00385           status = 
00386               Blast_ScoreBlkKbpGappedCalc(sbp, scoring_options, program_number,
00387                                           query_info, blast_message);
00388        } else {
00389           ASSERT(sbp->kbp_gap == NULL);
00390        }
00391     }
00392 
00393     return status;
00394 }
00395 
00396 Int2
00397 BlastSetup_Validate(const BlastQueryInfo* query_info, 
00398                     const BlastScoreBlk* score_blk) 
00399 {
00400     int index;
00401     Boolean valid_context_found = FALSE;
00402     ASSERT(query_info);
00403 
00404     for (index = query_info->first_context;
00405          index <= query_info->last_context;
00406          index++) {
00407         if (query_info->contexts[index].is_valid) {
00408             valid_context_found = TRUE;
00409         } else if (score_blk) {
00410             ASSERT(score_blk->kbp[index] == NULL);
00411             ASSERT(score_blk->sfp[index] == NULL);
00412             if (score_blk->kbp_gap) {
00413                 ASSERT(score_blk->kbp_gap[index] == NULL);
00414             }
00415         }
00416     }
00417 
00418     if (valid_context_found) {
00419         return 0;
00420     } else {
00421         return 1;
00422     }
00423 }
00424 
00425 Int2 BLAST_MainSetUp(EBlastProgramType program_number,
00426     const QuerySetUpOptions *qsup_options,
00427     const BlastScoringOptions *scoring_options,
00428     BLAST_SequenceBlk *query_blk,
00429     const BlastQueryInfo *query_info,
00430     double scale_factor,
00431     BlastSeqLoc **lookup_segments, 
00432     BlastMaskLoc **mask,
00433     BlastScoreBlk **sbpp, 
00434     Blast_Message **blast_message,
00435     GET_MATRIX_PATH get_path)
00436 {
00437     Boolean mask_at_hash = FALSE; /* mask only for making lookup table? */
00438     Int2 status = 0;            /* return value */
00439     BlastMaskLoc *filter_maskloc = NULL;   /* Local variable for mask locs. */
00440 
00441     SBlastFilterOptions* filter_options = qsup_options->filtering_options;
00442     Boolean filter_options_allocated = FALSE;
00443 
00444     ASSERT(blast_message);
00445 
00446     if (mask)
00447         *mask = NULL;
00448 
00449     if (filter_options == NULL && qsup_options->filter_string)
00450     {
00451          status = BlastFilteringOptionsFromString(program_number, 
00452                                                   qsup_options->filter_string, 
00453                                                   &filter_options, 
00454                                                   blast_message);
00455          if (status) {
00456             filter_options = SBlastFilterOptionsFree(filter_options);
00457             return status;
00458          }
00459          filter_options_allocated = TRUE;
00460     }
00461     ASSERT(filter_options);
00462 
00463     status = BlastSetUp_GetFilteringLocations(query_blk, 
00464                                               query_info, 
00465                                               program_number, 
00466                                               filter_options,
00467                                               & filter_maskloc, 
00468                                               blast_message);
00469 
00470     if (status) {
00471         if (filter_options_allocated)
00472             filter_options = SBlastFilterOptionsFree(filter_options);
00473         return status;
00474     } 
00475 
00476     mask_at_hash = SBlastFilterOptionsMaskAtHash(filter_options);
00477 
00478     if (filter_options_allocated) {
00479         filter_options = SBlastFilterOptionsFree(filter_options);
00480     }
00481 
00482 
00483     if (!mask_at_hash) {
00484         BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc, 
00485                              program_number);
00486     }
00487 
00488     if (program_number == eBlastTypeBlastx && scoring_options->is_ooframe) {
00489         BLAST_CreateMixedFrameDNATranslation(query_blk, query_info);
00490     }
00491 
00492     /* Find complement of the mask locations, for which lookup table will be
00493      * created. This should only be done if we do want to create a lookup table,
00494      * i.e. if it is a full search, not a traceback-only search. 
00495      */
00496     if (lookup_segments) {
00497         BLAST_ComplementMaskLocations(program_number, query_info, 
00498                                       filter_maskloc, lookup_segments);
00499     }
00500 
00501     if (mask)
00502     {
00503         if (Blast_QueryIsTranslated(program_number)) {
00504             /* Filter locations so far are in protein coordinates; 
00505                convert them back to nucleotide here. */
00506             BlastMaskLocProteinToDNA(filter_maskloc, query_info);
00507         }
00508         *mask = filter_maskloc;
00509         filter_maskloc = NULL;
00510     }
00511     else 
00512         filter_maskloc = BlastMaskLocFree(filter_maskloc);
00513 
00514     status = BlastSetup_ScoreBlkInit(query_blk, query_info, scoring_options, 
00515                                      program_number, sbpp, scale_factor, 
00516                                      blast_message, get_path);
00517     if (status) {
00518         return status;
00519     }
00520 
00521     if ( (status = BlastSetup_Validate(query_info, *sbpp) != 0)) {
00522         if (*blast_message == NULL) {
00523             Blast_Perror(blast_message, BLASTERR_INVALIDQUERIES, -1);
00524         }
00525         return 1;
00526     }
00527 
00528     return status;
00529 }
00530 
00531 /** Return the search space appropriate for a given context from
00532  * a list of tabulated search spaces
00533  * @param eff_len_options Container for search spaces [in]
00534  * @param context_index Identifier for the search space to return [in]
00535  * @param blast_message List of messages, to receive possible warnings [in][out]
00536  * @return The selected search space
00537  */
00538 static Int8 s_GetEffectiveSearchSpaceForContext(
00539                         const BlastEffectiveLengthsOptions* eff_len_options,
00540                         int context_index, Blast_Message **blast_message)
00541 {
00542     Int8 retval = 0;
00543 
00544     if (eff_len_options->num_searchspaces == 0) {
00545         retval = 0;
00546     } else if (eff_len_options->num_searchspaces == 1) {
00547         if (context_index != 0) {
00548             Blast_MessageWrite(blast_message, eBlastSevWarning, context_index, 
00549                     "One search space is being used for multiple sequences");
00550         }
00551         retval = eff_len_options->searchsp_eff[0];
00552     } else if (eff_len_options->num_searchspaces > 1) {
00553         ASSERT(context_index < eff_len_options->num_searchspaces);
00554         retval = eff_len_options->searchsp_eff[context_index];
00555     } else {
00556         abort();    /* should never happen */
00557     }
00558     return retval;
00559 }
00560 
00561 Int2 BLAST_CalcEffLengths (EBlastProgramType program_number, 
00562    const BlastScoringOptions* scoring_options,
00563    const BlastEffectiveLengthsParameters* eff_len_params, 
00564    const BlastScoreBlk* sbp, BlastQueryInfo* query_info,
00565    Blast_Message * *blast_message)
00566 {
00567    double alpha=0, beta=0; /*alpha and beta for new scoring system */
00568    Int4 index;      /* loop index. */
00569    Int4 db_num_seqs;    /* number of sequences in database. */
00570    Int8 db_length;  /* total length of database. */
00571    Blast_KarlinBlk* *kbp_ptr; /* Array of Karlin block pointers */
00572    const BlastEffectiveLengthsOptions* eff_len_options = eff_len_params->options;
00573 
00574    if (!query_info || !sbp)
00575       return -1;
00576 
00577 
00578    /* use overriding value from effective lengths options or the real value
00579       from effective lengths parameters. */
00580    if (eff_len_options->db_length > 0)
00581       db_length = eff_len_options->db_length;
00582    else
00583       db_length = eff_len_params->real_db_length;
00584 
00585    /* If database (subject) length is not available at this stage, and
00586     * overriding value of effective search space is not provided by user,
00587     * do nothing.
00588     * This situation can occur in the initial set up for a non-database search,
00589     * where each subject is treated as an individual database. 
00590     */
00591    if (db_length == 0 &&
00592        !BlastEffectiveLengthsOptions_IsSearchSpaceSet(eff_len_options)) {
00593       return 0;
00594    }
00595 
00596    if (Blast_SubjectIsTranslated(program_number))
00597       db_length = db_length/3;  
00598 
00599    if (eff_len_options->dbseq_num > 0)
00600       db_num_seqs = eff_len_options->dbseq_num;
00601    else
00602       db_num_seqs = eff_len_params->real_num_seqs;
00603    
00604    /* PHI BLAST search space calculation is different. */
00605    if (Blast_ProgramIsPhiBlast(program_number))
00606    {
00607         for (index = query_info->first_context;
00608            index <= query_info->last_context;
00609            index++) {
00610            Int8 effective_search_space = db_length - (db_num_seqs*(query_info->contexts[index].length_adjustment));
00611            query_info->contexts[index].eff_searchsp = effective_search_space;
00612         }
00613        
00614        return 0;
00615    }
00616 
00617    /* N.B.: the old code used kbp_gap_std instead of the kbp_gap alias (which
00618     * could be kbp_gap_psi), hence we duplicate that behavior here */
00619    kbp_ptr = (scoring_options->gapped_calculation ? sbp->kbp_gap_std : sbp->kbp);
00620    
00621    for (index = query_info->first_context;
00622         index <= query_info->last_context;
00623         index++) {
00624       Blast_KarlinBlk *kbp; /* statistical parameters for the current context */
00625       Int4 length_adjustment = 0; /* length adjustment for current iteration. */
00626       Int4 query_length;   /* length of an individual query sequence */
00627       
00628       /* Effective search space for a given sequence/strand/frame */
00629       Int8 effective_search_space =
00630           s_GetEffectiveSearchSpaceForContext(eff_len_options, index,
00631                                               blast_message);
00632 
00633       kbp = kbp_ptr[index];
00634       
00635       if (query_info->contexts[index].is_valid &&
00636           ((query_length = query_info->contexts[index].query_length) > 0) ) {
00637 
00638          /* Use the correct Karlin block. For blastn, two identical Karlin
00639           * blocks are allocated for each sequence (one per strand), but we
00640           * only need one of them.
00641           */
00642          if (program_number == eBlastTypeBlastn) {
00643              Blast_GetNuclAlphaBeta(scoring_options->reward, 
00644                                     scoring_options->penalty, 
00645                                     scoring_options->gap_open, 
00646                                     scoring_options->gap_extend, 
00647                                     sbp->kbp_std[index], 
00648                                     scoring_options->gapped_calculation,
00649                                     &alpha, &beta);
00650          } else {
00651              BLAST_GetAlphaBeta(sbp->name, &alpha, &beta,
00652                                 scoring_options->gapped_calculation, 
00653                                 scoring_options->gap_open, 
00654                                 scoring_options->gap_extend, 
00655                                 sbp->kbp_std[index]);
00656          }
00657          BLAST_ComputeLengthAdjustment(kbp->K, kbp->logK,
00658                                        alpha/kbp->Lambda, beta,
00659                                        query_length, db_length,
00660                                        db_num_seqs, &length_adjustment);
00661 
00662          if (effective_search_space == 0) {
00663 
00664              /* if the database length was specified, do not
00665                 adjust it when calculating the search space;
00666                 it's counter-intuitive to specify a value and 
00667                 not have that value be used */
00668 
00669              Int8 effective_db_length = db_length;
00670 
00671              if (eff_len_options->db_length == 0)
00672                  effective_db_length -= (Int8)db_num_seqs * length_adjustment;
00673 
00674              effective_search_space = effective_db_length *
00675                              (query_length - length_adjustment);
00676          }
00677       }
00678       query_info->contexts[index].eff_searchsp = effective_search_space;
00679       query_info->contexts[index].length_adjustment = length_adjustment;
00680    }
00681    return 0;
00682 }
00683 
00684 void
00685 BLAST_GetSubjectTotals(const BlastSeqSrc* seqsrc,
00686                        Int8* total_length,
00687                        Int4* num_seqs)
00688 {
00689     ASSERT(total_length && num_seqs);
00690 
00691     *total_length = -1;
00692     *num_seqs = -1;
00693 
00694     if ( !seqsrc )  {
00695         return;
00696     }
00697 
00698     *total_length = BlastSeqSrcGetTotLenStats(seqsrc);
00699     if (*total_length <= 0)
00700        *total_length = BlastSeqSrcGetTotLen(seqsrc);
00701 
00702     if (*total_length > 0) {
00703         *num_seqs = BlastSeqSrcGetNumSeqsStats(seqsrc);
00704         if (*num_seqs <= 0)
00705            *num_seqs = BlastSeqSrcGetNumSeqs(seqsrc);
00706     } else {
00707         /* Not a database search; each subject sequence is considered
00708            individually */
00709         Int4 oid = 0;  /* Get length of first sequence. */
00710         if ( (*total_length = BlastSeqSrcGetSeqLen(seqsrc, (void*) &oid)) < 0) {
00711             *total_length = -1;
00712             *num_seqs = -1;
00713             return;
00714         }
00715         *num_seqs = 1;
00716     }
00717 }
00718 
00719 Int2 
00720 BLAST_GapAlignSetUp(EBlastProgramType program_number,
00721     const BlastSeqSrc* seq_src,
00722     const BlastScoringOptions* scoring_options,
00723     const BlastEffectiveLengthsOptions* eff_len_options,
00724     const BlastExtensionOptions* ext_options,
00725     const BlastHitSavingOptions* hit_options,
00726     BlastQueryInfo* query_info, 
00727     BlastScoreBlk* sbp, 
00728     BlastScoringParameters** score_params,
00729     BlastExtensionParameters** ext_params,
00730     BlastHitSavingParameters** hit_params,
00731     BlastEffectiveLengthsParameters** eff_len_params,
00732     BlastGapAlignStruct** gap_align)
00733 {
00734    Int2 status = 0;
00735    Uint4 max_subject_length;
00736    Int8 total_length;
00737    Int4 num_seqs;
00738 
00739    BLAST_GetSubjectTotals(seq_src, &total_length, &num_seqs);
00740 
00741    /* Initialize the effective length parameters with real values of
00742       database length and number of sequences */
00743    BlastEffectiveLengthsParametersNew(eff_len_options, total_length, num_seqs, 
00744                                       eff_len_params);
00745    /* Effective lengths are calculated for all programs except PHI BLAST. */
00746    if ((status = BLAST_CalcEffLengths(program_number, scoring_options, 
00747                      *eff_len_params, sbp, query_info, NULL)) != 0)
00748       return status;
00749 
00750    BlastScoringParametersNew(scoring_options, sbp, score_params);
00751 
00752    BlastExtensionParametersNew(program_number, ext_options, sbp, 
00753                                query_info, ext_params);
00754 
00755    BlastHitSavingParametersNew(program_number, hit_options, sbp, query_info, 
00756                                (Int4)(total_length/num_seqs), hit_params);
00757 
00758    /* To initialize the gapped alignment structure, we need to know the 
00759       maximal subject sequence length */
00760    max_subject_length = BlastSeqSrcGetMaxSeqLen(seq_src);
00761 
00762    if ((status = BLAST_GapAlignStructNew(*score_params, *ext_params, 
00763                     max_subject_length, sbp, gap_align)) != 0) {
00764       return status;
00765    }
00766 
00767    return status;
00768 }
00769 
00770 Int2 BLAST_OneSubjectUpdateParameters(EBlastProgramType program_number,
00771                     Uint4 subject_length,
00772                     const BlastScoringOptions* scoring_options,
00773                     BlastQueryInfo* query_info, 
00774                     const BlastScoreBlk* sbp, 
00775                     BlastHitSavingParameters* hit_params,
00776                     BlastInitialWordParameters* word_params,
00777                     BlastEffectiveLengthsParameters* eff_len_params)
00778 {
00779    Int2 status = 0;
00780    eff_len_params->real_db_length = subject_length;
00781    if ((status = BLAST_CalcEffLengths(program_number, scoring_options, 
00782                                 eff_len_params, sbp, query_info, NULL)) != 0)
00783       return status;
00784    /* Update cutoff scores in hit saving parameters */
00785    BlastHitSavingParametersUpdate(program_number, sbp, query_info, subject_length, 
00786                                   hit_params);
00787    
00788    if (word_params) {
00789       /* Update cutoff scores in initial word parameters */
00790       BlastInitialWordParametersUpdate(program_number, hit_params, sbp, query_info, 
00791            subject_length, word_params);
00792       /* Update the parameters for linking HSPs, if necessary. */
00793       BlastLinkHSPParametersUpdate(word_params, hit_params, scoring_options->gapped_calculation);
00794    }
00795    return status;
00796 }
00797 
00798 void
00799 BlastSeqLoc_RestrictToInterval(BlastSeqLoc* *mask, Int4 from, Int4 to)
00800 {
00801    BlastSeqLoc* head_loc = NULL, *last_loc = NULL, *next_loc, *seqloc;
00802    
00803    to = MAX(to, 0);
00804 
00805    /* If there is no mask, or if both coordinates passed are 0, which indicates
00806       the full sequence, just return - there is nothing to be done. */
00807    if (mask == NULL || *mask == NULL || (from == 0 && to == 0))
00808       return;
00809 
00810    for (seqloc = *mask; seqloc; seqloc = next_loc) {
00811       next_loc = seqloc->next;
00812       seqloc->ssr->left = MAX(0, seqloc->ssr->left - from);
00813       seqloc->ssr->right = MIN(seqloc->ssr->right, to) - from;
00814       /* If this mask location does not intersect the [from,to] interval,
00815          do not add it to the newly constructed list and free its contents. */
00816       if (seqloc->ssr->left > seqloc->ssr->right) {
00817          /* Shift the pointer to the next link in chain and free this link. */
00818          if (last_loc)
00819             last_loc->next = seqloc->next;
00820          seqloc = BlastSeqLocNodeFree(seqloc);
00821       } else if (!head_loc) {
00822          /* First time a mask was found within the range. */
00823          head_loc = last_loc = seqloc;
00824       } else {
00825          /* Append to the previous masks. */
00826          last_loc->next = seqloc;
00827          last_loc = last_loc->next;
00828       }
00829    }
00830    *mask = head_loc;
00831 }
00832 
00833 Int2 
00834 Blast_SetPHIPatternInfo(EBlastProgramType            program,
00835                         const SPHIPatternSearchBlk * pattern_blk,
00836                         const BLAST_SequenceBlk    * query,
00837                         const BlastSeqLoc          * lookup_segments,
00838                         BlastQueryInfo             * query_info,
00839                         Blast_Message** blast_message)
00840 {
00841     const Boolean kIsNa = (program == eBlastTypePhiBlastn);
00842     Int4 num_patterns = 0;
00843     
00844     ASSERT(Blast_ProgramIsPhiBlast(program));
00845     ASSERT(query_info && pattern_blk);
00846     
00847     query_info->pattern_info = SPHIQueryInfoNew();
00848     
00849     /* If pattern is not found in query, return failure status. */
00850     num_patterns = PHIGetPatternOccurrences(pattern_blk, query, lookup_segments, kIsNa,
00851                                   query_info);
00852     if (num_patterns == 0)
00853     {
00854        char buffer[512]; 
00855        sprintf(buffer, "The pattern %s was not found in the query.", pattern_blk->pattern);
00856        if (blast_message)
00857            Blast_MessageWrite(blast_message, eBlastSevWarning, kBlastMessageNoContext, buffer);
00858        return -1;
00859     }
00860     else if (num_patterns == INT4_MAX)
00861     {
00862        char buffer[512]; 
00863        sprintf(buffer, "The pattern (%s) may not cover the entire query.", pattern_blk->pattern);
00864        if (blast_message)
00865            Blast_MessageWrite(blast_message, eBlastSevWarning, kBlastMessageNoContext, buffer);
00866        return -1;
00867     }
00868     else if (num_patterns < 0)
00869     {
00870        return -1;
00871     }
00872     
00873     /* Save pattern probability, because it needs to be passed back to
00874        formatting stage, where lookup table will not be available. */
00875     query_info->pattern_info->probability = pattern_blk->patternProbability;
00876 
00877    /* Also needed for formatting. */
00878     query_info->pattern_info->pattern = 
00879         (char*) (char *) BlastMemDup(pattern_blk->pattern, 1+strlen(pattern_blk->pattern));
00880 
00881     /* Save minimal pattern length in the length adjustment field, because 
00882        that is essentially its meaning. */
00883     query_info->contexts[0].length_adjustment = 
00884         pattern_blk->minPatternMatchLength;
00885 
00886     return 0;
00887 }
00888 
00889 

Generated on Sun Dec 6 22:17:21 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Dec 07 16:20:50 2009 by modify_doxy.py rev. 173732