00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #ifndef SKIP_DOXYGEN_PROCESSING
00036 static char const rcsid[] =
00037 "$Id: blast_kappa.c 148871 2009-01-05 16:51:12Z camacho $";
00038 #endif
00039
00040 #include <float.h>
00041 #include <algo/blast/core/ncbi_math.h>
00042 #include <algo/blast/core/blast_hits.h>
00043 #include <algo/blast/core/blast_kappa.h>
00044 #include <algo/blast/core/blast_util.h>
00045 #include <algo/blast/core/blast_gapalign.h>
00046 #include <algo/blast/core/blast_filter.h>
00047 #include <algo/blast/core/blast_traceback.h>
00048 #include <algo/blast/core/link_hsps.h>
00049 #include <algo/blast/core/gencode_singleton.h>
00050 #include "blast_psi_priv.h"
00051 #include "blast_gapalign_priv.h"
00052 #include "blast_hits_priv.h"
00053 #include "blast_posit.h"
00054
00055 #include <algo/blast/composition_adjustment/nlm_linear_algebra.h>
00056 #include <algo/blast/composition_adjustment/compo_heap.h>
00057 #include <algo/blast/composition_adjustment/redo_alignment.h>
00058 #include <algo/blast/composition_adjustment/matrix_frequency_data.h>
00059 #include <algo/blast/composition_adjustment/unified_pvalues.h>
00060
00061
00062
00063
00064
00065
00066
00067 #ifndef KAPPA_BLASTP_NO_SEG_SEQUENCE
00068 #define KAPPA_BLASTP_NO_SEG_SEQUENCE 0
00069 #endif
00070
00071
00072
00073
00074
00075 #ifndef KAPPA_TBLASTN_NO_SEG_SEQUENCE
00076 #define KAPPA_TBLASTN_NO_SEG_SEQUENCE 0
00077 #endif
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092 static void
00093 s_HSPListNormalizeScores(BlastHSPList * hsp_list,
00094 double lambda,
00095 double logK,
00096 double scoreDivisor)
00097 {
00098 int hsp_index;
00099 for(hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
00100 BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
00101
00102 hsp->score = BLAST_Nint(((double) hsp->score) / scoreDivisor);
00103
00104 hsp->bit_score = (hsp->score*lambda*scoreDivisor - logK)/NCBIMATH_LN2;
00105 }
00106 }
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125 static void
00126 s_AdjustEvaluesForComposition(
00127 BlastHSPList *hsp_list,
00128 double comp_p_value,
00129 const BlastSeqSrc* seqSrc,
00130 Int4 subject_length,
00131 BlastContextInfo * query_context,
00132 double LambdaRatio,
00133 int subject_id)
00134 {
00135
00136 double best_evalue = DBL_MAX;
00137
00138
00139 int query_length = query_context->query_length;
00140
00141 int length_adjustment = query_context->length_adjustment;
00142
00143
00144 double query_eff = MAX((query_length - length_adjustment), 1);
00145 double subject_eff = MAX((subject_length - length_adjustment), 1.0);
00146 double dblen_eff = (double) query_context->eff_searchsp / query_eff;
00147
00148
00149 double db_to_sequence_scale = subject_eff / dblen_eff;
00150
00151 int hsp_index;
00152 for (hsp_index = 0; hsp_index < hsp_list->hspcnt; hsp_index++) {
00153
00154 double align_p_value;
00155 double combined_p_value;
00156
00157
00158 BlastHSP * hsp = hsp_list->hsp_array[hsp_index];
00159 #ifdef KAPPA_PRINT_DIAGNOSTICS
00160
00161 double old_e_value = hsp->evalue;
00162 #endif
00163 hsp->evalue *= db_to_sequence_scale;
00164
00165 align_p_value = BLAST_KarlinEtoP(hsp->evalue);
00166 combined_p_value = Blast_Overall_P_Value(comp_p_value,align_p_value);
00167 hsp->evalue = BLAST_KarlinPtoE(combined_p_value);
00168 hsp->evalue /= db_to_sequence_scale;
00169
00170 if (hsp->evalue < best_evalue) {
00171 best_evalue = hsp->evalue;
00172 }
00173
00174 #ifdef KAPPA_PRINT_DIAGNOSTICS
00175 {{
00176 int sequence_gi;
00177 Blast_GiList* gi_list;
00178 gi_list = BlastSeqSrcGetGis(seqSrc, (void *) (&subject_id));
00179 if ((gi_list) && (gi_list->num_used > 0)) {
00180 sequence_gi = gi_list->data[0];
00181 } else {
00182 sequence_gi = (-1);
00183 }
00184 printf("GI %d Lambda ratio %e comp. p-value %e; "
00185 "adjust E-value of query length %d match length "
00186 "%d from %e to %e\n",
00187 sequence_gi, LambdaRatio, comp_p_value,
00188 query_length, subject_length, old_e_value, hsp->evalue);
00189 Blast_GiListFree(gi_list);
00190 }}
00191 #endif
00192 }
00193
00194 hsp_list->best_evalue = best_evalue;
00195
00196
00197 (void) seqSrc;
00198 (void) query_length;
00199 (void) LambdaRatio;
00200 (void) subject_id;
00201 }
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214 static void
00215 s_HitlistReapContained(BlastHSP * hsp_array[], Int4 * hspcnt)
00216 {
00217 Int4 iread;
00218 Int4 iwrite;
00219 Int4 old_hspcnt;
00220
00221 old_hspcnt = *hspcnt;
00222
00223 for (iread = 1; iread < *hspcnt; iread++) {
00224
00225 Int4 ireadBack;
00226 BlastHSP *hsp1;
00227
00228 hsp1 = hsp_array[iread];
00229 for (ireadBack = 0; ireadBack < iread && hsp1 != NULL; ireadBack++) {
00230
00231
00232 BlastHSP *hsp2;
00233
00234 hsp2 = hsp_array[ireadBack];
00235
00236 if( hsp2 == NULL ) {
00237 continue;
00238 }
00239 if (SIGN(hsp2->query.frame) == SIGN(hsp1->query.frame) &&
00240 SIGN(hsp2->subject.frame) == SIGN(hsp1->subject.frame)) {
00241
00242 if (CONTAINED_IN_HSP
00243 (hsp2->query.offset, hsp2->query.end, hsp1->query.offset,
00244 hsp2->subject.offset, hsp2->subject.end,
00245 hsp1->subject.offset) &&
00246 CONTAINED_IN_HSP
00247 (hsp2->query.offset, hsp2->query.end, hsp1->query.end,
00248 hsp2->subject.offset, hsp2->subject.end,
00249 hsp1->subject.end) &&
00250 hsp1->score <= hsp2->score) {
00251 hsp1 = hsp_array[iread] = Blast_HSPFree(hsp_array[iread]);
00252 }
00253 }
00254 }
00255 }
00256
00257
00258 iwrite = 0;
00259 for (iread = 0; iread < *hspcnt; iread++) {
00260 if (hsp_array[iread] != NULL) {
00261 hsp_array[iwrite++] = hsp_array[iread];
00262 }
00263 }
00264 *hspcnt = iwrite;
00265
00266 for ( ; iwrite < old_hspcnt; iwrite++) {
00267 hsp_array[iwrite] = NULL;
00268 }
00269 }
00270
00271
00272
00273
00274 static void s_FreeEditScript(void * edit_script)
00275 {
00276 if (edit_script != NULL)
00277 GapEditScriptDelete(edit_script);
00278 }
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293 static BlastHSPList *
00294 s_HSPListFromDistinctAlignments(BlastCompo_Alignment ** alignments,
00295 int oid,
00296 BlastQueryInfo* queryInfo)
00297 {
00298 int status = 0;
00299 static const int unknown_value = 0;
00300
00301 BlastCompo_Alignment * align;
00302 BlastHSPList * hsp_list;
00303
00304 hsp_list = Blast_HSPListNew(0);
00305 if (hsp_list == NULL) {
00306 return NULL;
00307 }
00308 hsp_list->oid = oid;
00309
00310 for (align = *alignments; NULL != align; align = align->next) {
00311 BlastHSP * new_hsp = NULL;
00312 GapEditScript * editScript = align->context;
00313 int query_offset, queryStart, queryEnd;
00314 align->context = NULL;
00315
00316 query_offset = queryInfo->contexts[align->queryIndex].query_offset;
00317 queryStart = align->queryStart - query_offset;
00318 queryEnd = align->queryEnd - query_offset;
00319
00320 status = Blast_HSPInit(queryStart, queryEnd,
00321 align->matchStart, align->matchEnd,
00322 unknown_value, unknown_value,
00323 align->queryIndex,
00324 0, (Int2) align->frame, align->score,
00325 &editScript, &new_hsp);
00326 switch (align->matrix_adjust_rule) {
00327 case eDontAdjustMatrix:
00328 new_hsp->comp_adjustment_method = eNoCompositionBasedStats;
00329 break;
00330 case eCompoScaleOldMatrix:
00331 new_hsp->comp_adjustment_method = eCompositionBasedStats;
00332 break;
00333 default:
00334 new_hsp->comp_adjustment_method = eCompositionMatrixAdjust;
00335 break;
00336 }
00337 if (status != 0)
00338 break;
00339
00340
00341
00342
00343 new_hsp->num_ident = 0;
00344
00345 status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
00346 if (status != 0)
00347 break;
00348 }
00349 if (status == 0) {
00350 BlastCompo_AlignmentsFree(alignments, s_FreeEditScript);
00351 Blast_HSPListSortByScore(hsp_list);
00352 } else {
00353 hsp_list = Blast_HSPListFree(hsp_list);
00354 }
00355 return hsp_list;
00356 }
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382 static int
00383 s_HitlistEvaluateAndPurge(int * pbestScore, double *pbestEvalue,
00384 BlastHSPList * hsp_list,
00385 const BlastSeqSrc* seqSrc,
00386 int subject_length,
00387 EBlastProgramType program_number,
00388 BlastQueryInfo* queryInfo,
00389 int query_index,
00390 BlastScoreBlk* sbp,
00391 const BlastHitSavingParameters* hitParams,
00392 double pvalueForThisPair,
00393 double LambdaRatio,
00394 int subject_id)
00395 {
00396 int status = 0;
00397 *pbestEvalue = DBL_MAX;
00398 *pbestScore = 0;
00399 if (hitParams->do_sum_stats) {
00400 status = BLAST_LinkHsps(program_number, hsp_list, queryInfo,
00401 subject_length, sbp,
00402 hitParams->link_hsp_params, TRUE);
00403 } else {
00404 status =
00405 Blast_HSPListGetEvalues(queryInfo, hsp_list, TRUE, sbp,
00406 0.0,
00407
00408 1.0);
00409
00410
00411
00412 }
00413 if (eBlastTypeBlastp == program_number) {
00414 if ((0 <= pvalueForThisPair) && (pvalueForThisPair <= 1)) {
00415 s_AdjustEvaluesForComposition(hsp_list, pvalueForThisPair, seqSrc,
00416 subject_length,
00417 &queryInfo->contexts[query_index],
00418 LambdaRatio, subject_id);
00419 }
00420 }
00421 if (status == 0) {
00422 Blast_HSPListReapByEvalue(hsp_list, hitParams->options);
00423 if (hsp_list->hspcnt > 0) {
00424 *pbestEvalue = hsp_list->best_evalue;
00425 *pbestScore = hsp_list->hsp_array[0]->score;
00426 }
00427 }
00428 return status == 0 ? 0 : -1;
00429 }
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441 static void
00442 s_ComputeNumIdentities(const BLAST_SequenceBlk* query_blk,
00443 const BlastQueryInfo* query_info,
00444 const BlastSeqSrc* seq_src,
00445 BlastHSPList* hsp_list,
00446 const BlastScoringOptions* scoring_options,
00447 const Uint1* gen_code_string)
00448 {
00449 Uint1* query = NULL;
00450 Uint1* subject = NULL;
00451 const EBlastProgramType program_number = scoring_options->program_number;
00452 const Boolean kIsOutOfFrame = scoring_options->is_ooframe;
00453 const EBlastEncoding encoding = Blast_TracebackGetEncoding(program_number);
00454 BlastSeqSrcGetSeqArg seq_arg;
00455 BLAST_SequenceBlk* subject_blk = NULL;
00456 Int2 status = 0;
00457 int i;
00458 SBlastTargetTranslation* target_t = NULL;
00459
00460 if ( !hsp_list || (program_number != eBlastTypeBlastp && program_number != eBlastTypeTblastn))
00461 return;
00462
00463
00464 {
00465 memset((void*) &seq_arg, 0, sizeof(seq_arg));
00466 seq_arg.oid = hsp_list->oid;
00467 seq_arg.encoding = encoding;
00468 status = BlastSeqSrcGetSequence(seq_src, (void*) &seq_arg);
00469 ASSERT(status == 0);
00470 }
00471
00472 if (program_number == eBlastTypeTblastn) {
00473 subject_blk = seq_arg.seq;
00474 BlastTargetTranslationNew(subject_blk, gen_code_string, eBlastTypeTblastn,
00475 kIsOutOfFrame, &target_t);
00476 }
00477 else {
00478 subject = seq_arg.seq->sequence;
00479 }
00480
00481 for (i = 0; i < hsp_list->hspcnt; i++) {
00482 BlastHSP* hsp = hsp_list->hsp_array[i];
00483
00484
00485 if (program_number == eBlastTypeBlastx && kIsOutOfFrame) {
00486 Int4 context = hsp->context - hsp->context % CODON_LENGTH;
00487 Int4 context_offset = query_info->contexts[context].query_offset;
00488 query = query_blk->oof_sequence + CODON_LENGTH + context_offset;
00489 } else {
00490 query = query_blk->sequence +
00491 query_info->contexts[hsp->context].query_offset;
00492 }
00493
00494
00495 if (program_number == eBlastTypeTblastn) {
00496 const Uint1* target_sequence = Blast_HSPGetTargetTranslation(target_t, hsp, NULL);
00497 status = Blast_HSPGetNumIdentities(query, target_sequence, hsp, scoring_options, 0);
00498 }
00499 else
00500 status = Blast_HSPGetNumIdentities(query, subject, hsp, scoring_options, 0);
00501
00502 ASSERT(status == 0);
00503 }
00504 target_t = BlastTargetTranslationFree(target_t);
00505 BlastSeqSrcReleaseSequence(seq_src, (void*) &seq_arg);
00506 BlastSequenceBlkFree(seq_arg.seq);
00507 }
00508
00509
00510
00511
00512
00513
00514
00515 static double
00516 s_CalcLambda(double probs[], int min_score, int max_score, double lambda0)
00517 {
00518
00519 int i;
00520 int score_range;
00521 double avg;
00522 Blast_ScoreFreq freq;
00523
00524 score_range = max_score - min_score + 1;
00525 avg = 0.0;
00526 for (i = 0; i < score_range; i++) {
00527 avg += (min_score + i) * probs[i];
00528 }
00529 freq.score_min = min_score;
00530 freq.score_max = max_score;
00531 freq.obs_min = min_score;
00532 freq.obs_max = max_score;
00533 freq.sprob0 = probs;
00534 freq.sprob = &probs[-min_score];
00535 freq.score_avg = avg;
00536
00537 return Blast_KarlinLambdaNR(&freq, lambda0);
00538 }
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556 static int
00557 s_GetPosBasedStartFreqRatios(double ** returnRatios,
00558 Int4 numPositions,
00559 Uint1 * query,
00560 const char *matrixName,
00561 double **startNumerator)
00562 {
00563 Int4 i,j;
00564 SFreqRatios * stdFreqRatios = NULL;
00565
00566 double *standardProb;
00567
00568 const double kPosEpsilon = 0.0001;
00569
00570
00571 stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
00572 if (stdFreqRatios == NULL) {
00573 return -1;
00574 }
00575 for (i = 0; i < numPositions; i++) {
00576 for (j = 0; j < BLASTAA_SIZE; j++) {
00577 returnRatios[i][j] = stdFreqRatios->data[query[i]][j];
00578 }
00579 }
00580 stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
00581
00582 standardProb = BLAST_GetStandardAaProbabilities();
00583 if(standardProb == NULL) {
00584 return -1;
00585 }
00586
00587 for (i = 0; i < numPositions; i++) {
00588 for (j = 0; j < BLASTAA_SIZE; j++) {
00589 if ((standardProb[query[i]] > kPosEpsilon) &&
00590 (standardProb[j] > kPosEpsilon) &&
00591 (j != eStopChar) && (j != eXchar) &&
00592 (startNumerator[i][j] > kPosEpsilon)) {
00593 returnRatios[i][j] = startNumerator[i][j] / standardProb[j];
00594 }
00595 }
00596 }
00597 sfree(standardProb);
00598
00599 return 0;
00600 }
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613 static int
00614 s_GetStartFreqRatios(double ** returnRatios,
00615 const char *matrixName)
00616 {
00617
00618 int i,j;
00619
00620 SFreqRatios * stdFreqRatios = NULL;
00621
00622 stdFreqRatios = _PSIMatrixFrequencyRatiosNew(matrixName);
00623 if (stdFreqRatios == NULL) {
00624 return -1;
00625 }
00626 for (i = 0; i < BLASTAA_SIZE; i++) {
00627 for (j = 0; j < BLASTAA_SIZE; j++) {
00628 returnRatios[i][j] = stdFreqRatios->data[i][j];
00629 }
00630 }
00631 stdFreqRatios = _PSIMatrixFrequencyRatiosFree(stdFreqRatios);
00632
00633 return 0;
00634 }
00635
00636
00637
00638
00639
00640
00641 #define SCALING_FACTOR 32
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658 static int
00659 s_ScalePosMatrix(int ** fillPosMatrix,
00660 const char * matrixName,
00661 double ** posFreqs,
00662 Uint1 * query,
00663 int queryLength,
00664 BlastScoreBlk* sbp,
00665 double scale_factor)
00666 {
00667
00668 Kappa_posSearchItems *posSearch = NULL;
00669
00670 Kappa_compactSearchItems *compactSearch = NULL;
00671
00672 _PSIInternalPssmData* internal_pssm = NULL;
00673
00674 int status = 0;
00675
00676 posSearch = Kappa_posSearchItemsNew(queryLength, matrixName,
00677 fillPosMatrix, posFreqs);
00678 compactSearch = Kappa_compactSearchItemsNew(query, queryLength, sbp);
00679
00680 internal_pssm = _PSIInternalPssmDataNew(queryLength, BLASTAA_SIZE);
00681 if (posSearch == NULL || compactSearch == NULL || internal_pssm == NULL) {
00682 status = -1;
00683 goto cleanup;
00684 }
00685 _PSICopyMatrix_int(internal_pssm->pssm, posSearch->posMatrix,
00686 internal_pssm->ncols, internal_pssm->nrows);
00687 _PSICopyMatrix_int(internal_pssm->scaled_pssm,
00688 posSearch->posPrivateMatrix,
00689 internal_pssm->ncols, internal_pssm->nrows);
00690 _PSICopyMatrix_double(internal_pssm->freq_ratios,
00691 posSearch->posFreqs, internal_pssm->ncols,
00692 internal_pssm->nrows);
00693 status = _PSIConvertFreqRatiosToPSSM(internal_pssm, query, sbp,
00694 compactSearch->standardProb);
00695 if (status != 0) {
00696 goto cleanup;
00697 }
00698
00699 _PSICopyMatrix_int(posSearch->posMatrix, internal_pssm->pssm,
00700 internal_pssm->ncols, internal_pssm->nrows);
00701 _PSICopyMatrix_int(posSearch->posPrivateMatrix,
00702 internal_pssm->scaled_pssm,
00703 internal_pssm->ncols, internal_pssm->nrows);
00704 _PSICopyMatrix_double(posSearch->posFreqs,
00705 internal_pssm->freq_ratios,
00706 internal_pssm->ncols, internal_pssm->nrows);
00707 status = Kappa_impalaScaling(posSearch, compactSearch, (double)
00708 scale_factor, FALSE, sbp);
00709 cleanup:
00710 internal_pssm = _PSIInternalPssmDataFree(internal_pssm);
00711 posSearch = Kappa_posSearchItemsFree(posSearch);
00712 compactSearch = Kappa_compactSearchItemsFree(compactSearch);
00713
00714 return status;
00715 }
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731 static BlastCompo_Alignment *
00732 s_ResultHspToDistinctAlign(BlastHSP * hsp_array[], Int4 hspcnt,
00733 BlastQueryInfo* queryInfo,
00734 double localScalingFactor)
00735 {
00736 BlastCompo_Alignment * aligns = NULL;
00737 BlastCompo_Alignment * tail = NULL;
00738 int hsp_index;
00739
00740 for (hsp_index = 0; hsp_index < hspcnt; hsp_index++) {
00741 BlastHSP * hsp = hsp_array[hsp_index];
00742 BlastCompo_Alignment * new_align;
00743 int queryStart;
00744
00745 int queryOffset, queryEnd;
00746
00747
00748
00749
00750
00751
00752 queryStart = queryInfo->contexts[hsp->context].query_offset;
00753 queryOffset = hsp->query.offset + queryStart;
00754 queryEnd = hsp->query.end + queryStart;
00755 new_align =
00756 BlastCompo_AlignmentNew((int) (hsp->score * localScalingFactor),
00757 eDontAdjustMatrix,
00758 queryOffset, queryEnd, hsp->context,
00759 hsp->subject.offset, hsp->subject.end,
00760 hsp->subject.frame, hsp);
00761 if (new_align == NULL)
00762 goto error_return;
00763 if (tail == NULL) {
00764
00765 aligns = new_align;
00766 } else {
00767
00768 tail->next = new_align;
00769 }
00770 tail = new_align;
00771 }
00772 goto normal_return;
00773 error_return:
00774 BlastCompo_AlignmentsFree(&aligns, NULL);
00775 normal_return:
00776 return aligns;
00777 }
00778
00779
00780
00781
00782
00783
00784
00785
00786
00787
00788
00789
00790
00791
00792
00793
00794
00795
00796
00797
00798
00799
00800
00801
00802
00803
00804
00805
00806 static void
00807 s_SWFindFinalEndsUsingXdrop(BlastCompo_SequenceData * query,
00808 Int4 queryStart,
00809 Int4 queryEnd,
00810 BlastCompo_SequenceData * subject,
00811 Int4 matchStart,
00812 Int4 matchEnd,
00813 BlastGapAlignStruct* gap_align,
00814 const BlastScoringParameters* scoringParams,
00815 Int4 score,
00816 Int4 * queryAlignmentExtent,
00817 Int4 * matchAlignmentExtent,
00818 Int4 * newScore)
00819 {
00820 Int4 XdropAlignScore;
00821
00822 Int4 doublingCount = 0;
00823
00824 Int4 gap_x_dropoff_orig = gap_align->gap_x_dropoff;
00825
00826 GapPrelimEditBlockReset(gap_align->rev_prelim_tback);
00827 GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
00828 do {
00829 XdropAlignScore =
00830 ALIGN_EX(&(query->data[queryStart]) - 1,
00831 &(subject->data[matchStart]) - 1,
00832 queryEnd - queryStart + 1, matchEnd - matchStart + 1,
00833 queryAlignmentExtent,
00834 matchAlignmentExtent, gap_align->fwd_prelim_tback,
00835 gap_align, scoringParams, queryStart - 1, FALSE, FALSE,
00836 NULL);
00837
00838 gap_align->gap_x_dropoff *= 2;
00839 doublingCount++;
00840 if((XdropAlignScore < score) && (doublingCount < 3)) {
00841 GapPrelimEditBlockReset(gap_align->fwd_prelim_tback);
00842 }
00843 } while((XdropAlignScore < score) && (doublingCount < 3));
00844
00845 gap_align->gap_x_dropoff = gap_x_dropoff_orig;
00846 *newScore = XdropAlignScore;
00847 }
00848
00849
00850
00851
00852
00853
00854 typedef struct
00855 BlastKappa_SequenceInfo {
00856 EBlastProgramType prog_number;
00857
00858
00859
00860 const BlastSeqSrc* seq_src;
00861 BlastSeqSrcGetSeqArg seq_arg;
00862
00863
00864
00865
00866 } BlastKappa_SequenceInfo;
00867
00868
00869
00870 static void
00871 s_MatchingSequenceRelease(BlastCompo_MatchingSequence * self)
00872 {
00873 if (self != NULL) {
00874 BlastKappa_SequenceInfo * local_data = self->local_data;
00875 BlastSeqSrcReleaseSequence(local_data->seq_src,
00876 &local_data->seq_arg);
00877 BlastSequenceBlkFree(local_data->seq_arg.seq);
00878 free(self->local_data);
00879 self->local_data = NULL;
00880 }
00881 }
00882
00883
00884
00885
00886
00887
00888
00889
00890
00891
00892
00893
00894
00895 static Boolean
00896 s_TestNearIdentical(const BlastCompo_SequenceData *seqData,
00897 const BlastCompo_SequenceData *queryData,
00898 const int queryOffset,
00899 const BlastCompo_Alignment *align,
00900 const int rangeOffset)
00901 {
00902 int numIdentical = 0;
00903 double fractionIdentical;
00904 int qPos, sPos;
00905 int qEnd;
00906 const double kMinFractionNearIdentical = 0.98;
00907
00908 qPos = align->queryStart - queryOffset;
00909 qEnd = align->queryEnd - queryOffset;
00910 sPos = align->matchStart - rangeOffset;
00911 while (qPos < qEnd) {
00912 if (queryData->data[qPos] == seqData->data[sPos])
00913 numIdentical++;
00914 sPos++;
00915 qPos++;
00916 }
00917 fractionIdentical = ((double) numIdentical/
00918 (double) (align->queryEnd - align->queryStart));
00919 if (fractionIdentical >= kMinFractionNearIdentical)
00920 return(TRUE);
00921 else
00922 return(FALSE);
00923 }
00924
00925
00926
00927
00928
00929
00930
00931
00932
00933
00934
00935
00936
00937
00938
00939
00940 static int
00941 s_MatchingSequenceInitialize(BlastCompo_MatchingSequence * self,
00942 EBlastProgramType program_number,
00943 const BlastSeqSrc* seqSrc,
00944 Int4 default_db_genetic_code,
00945 Int4 subject_index)
00946 {
00947 BlastKappa_SequenceInfo * seq_info;
00948
00949 self->length = 0;
00950 self->local_data = NULL;
00951
00952 seq_info = malloc(sizeof(BlastKappa_SequenceInfo));
00953 if (seq_info != NULL) {
00954 self->local_data = seq_info;
00955
00956 seq_info->seq_src = seqSrc;
00957 seq_info->prog_number = program_number;
00958
00959 memset((void*) &seq_info->seq_arg, 0, sizeof(seq_info->seq_arg));
00960 seq_info->seq_arg.oid = self->index = subject_index;
00961
00962 if( program_number == eBlastTypeTblastn ) {
00963 seq_info->seq_arg.encoding = eBlastEncodingNcbi4na;
00964 } else {
00965 seq_info->seq_arg.encoding = eBlastEncodingProtein;
00966 }
00967 if (BlastSeqSrcGetSequence(seqSrc, &seq_info->seq_arg) >= 0) {
00968 self->length =
00969 BlastSeqSrcGetSeqLen(seqSrc, (void*) &seq_info->seq_arg);
00970
00971
00972
00973
00974 if (Blast_SubjectIsTranslated(program_number) &&
00975 seq_info->seq_arg.seq->gen_code_string == NULL) {
00976 seq_info->seq_arg.seq->gen_code_string =
00977 GenCodeSingletonFind(default_db_genetic_code);
00978 ASSERT(seq_info->seq_arg.seq->gen_code_string);
00979 }
00980 } else {
00981 self->length = 0;
00982 }
00983 }
00984 if (self->length == 0) {
00985
00986 s_MatchingSequenceRelease(self);
00987 return -1;
00988 } else {
00989 return 0;
00990 }
00991 }
00992
00993
00994
00995 #define BLASTP_MASK_RESIDUE 21
00996
00997 #define BLASTP_MASK_INSTRUCTIONS "S 10 1.8 2.1"
00998
00999
01000
01001
01002
01003
01004
01005
01006
01007
01008 static int
01009 s_DoSegSequenceData(BlastCompo_SequenceData * seqData,
01010 EBlastProgramType program_name)
01011 {
01012 int status = 0;
01013 BlastSeqLoc* mask_seqloc = NULL;
01014 SBlastFilterOptions* filter_options = NULL;
01015
01016 status = BlastFilteringOptionsFromString(program_name,
01017 BLASTP_MASK_INSTRUCTIONS,
01018 &filter_options, NULL);
01019 if (status == 0) {
01020 status = BlastSetUp_Filter(program_name, seqData->data,
01021 seqData->length, 0, filter_options,
01022 &mask_seqloc, NULL);
01023 filter_options = SBlastFilterOptionsFree(filter_options);
01024 }
01025 if (status == 0) {
01026 Blast_MaskTheResidues(seqData->data, seqData->length,
01027 FALSE, mask_seqloc, FALSE, 0);
01028 }
01029 if (mask_seqloc != NULL) {
01030 mask_seqloc = BlastSeqLocFree(mask_seqloc);
01031 }
01032 return status;
01033 }
01034
01035
01036
01037
01038
01039
01040
01041
01042
01043
01044
01045
01046
01047
01048
01049
01050
01051 static int
01052 s_SequenceGetTranslatedRange(const BlastCompo_MatchingSequence * self,
01053 const BlastCompo_SequenceRange * range,
01054 BlastCompo_SequenceData * seqData,
01055 const BlastCompo_SequenceData * queryData,
01056 const int queryOffset,
01057 const BlastCompo_Alignment *align,
01058 const Boolean shouldTestIdentical)
01059 {
01060 int status = 0;
01061 BlastKappa_SequenceInfo * local_data;
01062
01063
01064 Uint1 * translation_buffer;
01065
01066 Int4 translated_length;
01067 int translation_frame;
01068 Uint1 * na_sequence;
01069 int translation_start;
01070
01071 int num_nucleotides;
01072
01073 local_data = self->local_data;
01074 na_sequence = local_data->seq_arg.seq->sequence_start;
01075
01076
01077 seqData->buffer = NULL;
01078 seqData->data = NULL;
01079 seqData->length = 0;
01080
01081 translation_frame = range->context;
01082 if (translation_frame > 0) {
01083 translation_start = 3 * range->begin;
01084 } else {
01085 translation_start =
01086 self->length - 3 * range->end + translation_frame + 1;
01087 }
01088 num_nucleotides =
01089 3 * (range->end - range->begin) + ABS(translation_frame) - 1;
01090
01091 status = Blast_GetPartialTranslation(na_sequence + translation_start,
01092 num_nucleotides,
01093 (Int2) translation_frame,
01094 local_data->seq_arg.seq->gen_code_string,
01095 &translation_buffer,
01096 &translated_length,
01097 NULL);
01098 if (status == 0) {
01099 seqData->buffer = translation_buffer;
01100 seqData->data = translation_buffer + 1;
01101 seqData->length = translated_length;
01102
01103 if ( !(KAPPA_TBLASTN_NO_SEG_SEQUENCE) ) {
01104 if ((!shouldTestIdentical) ||
01105 (shouldTestIdentical &&
01106 (!s_TestNearIdentical(seqData, queryData, queryOffset, align, range->begin)))) {
01107 status = s_DoSegSequenceData(seqData, eBlastTypeTblastn);
01108 if (status != 0) {
01109 free(seqData->buffer);
01110 seqData->buffer = NULL;
01111 seqData->data = NULL;
01112 seqData->length = 0;
01113 }
01114 }
01115 }
01116 }
01117 return status;
01118 }
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
01130
01131
01132
01133
01134
01135
01136
01137 static int
01138 s_SequenceGetProteinRange(const BlastCompo_MatchingSequence * self,
01139 const BlastCompo_SequenceRange * range,
01140 BlastCompo_SequenceData * seqData,
01141 const BlastCompo_SequenceData * queryData,
01142 const int queryOffset,
01143 const BlastCompo_Alignment *align,
01144 const Boolean shouldTestIdentical)
01145
01146 {
01147 int status = 0;
01148 Int4 idx;
01149 Uint1 *origData;
01150
01151 BlastKappa_SequenceInfo * local_data = self->local_data;
01152
01153 seqData->data = NULL;
01154 seqData->length = 0;
01155
01156 seqData->buffer = calloc((self->length + 2), sizeof(Uint1));
01157 if (seqData->buffer == NULL) {
01158 return -1;
01159 }
01160
01161
01162 seqData->data = seqData->buffer + 1;
01163 seqData->length = self->length;
01164
01165 origData = local_data->seq_arg.seq->sequence;
01166 for (idx = 0; idx < seqData->length; idx++) {
01167
01168
01169
01170 if (origData[idx] != 24) {
01171 seqData->data[idx] = origData[idx];
01172 } else {
01173 seqData->data[idx] = 21;
01174 fprintf(stderr, "Selenocysteine (U) at position %ld"
01175 " replaced by X\n",
01176 (long) idx + 1);
01177 }
01178 }
01179 if ( !(KAPPA_BLASTP_NO_SEG_SEQUENCE) ) {
01180 if ((!shouldTestIdentical) ||
01181 (shouldTestIdentical &&
01182 (!s_TestNearIdentical(seqData, queryData, queryOffset, align, 0)))) {
01183 status = -1;
01184 status = s_DoSegSequenceData(seqData, eBlastTypeBlastp);
01185 }
01186 }
01187
01188 seqData ->data = &seqData->data[range->begin - 1];
01189 *seqData->data++ = '\0';
01190 seqData ->length = range->end - range->begin;
01191
01192 if (status != 0) {
01193 free(seqData->buffer);
01194 seqData->buffer = NULL;
01195 seqData->data = NULL;
01196 }
01197 return status;
01198 }
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217 static int
01218 s_SequenceGetRange(const BlastCompo_MatchingSequence * self,
01219 const BlastCompo_SequenceRange * range,
01220 BlastCompo_SequenceData * seqData,
01221 const BlastCompo_SequenceData * queryData,
01222 const int queryOffset,
01223 const BlastCompo_Alignment *align,
01224 const Boolean shouldTestIdentical)
01225 {
01226 BlastKappa_SequenceInfo * seq_info = self->local_data;
01227 if (seq_info->prog_number == eBlastTypeTblastn) {
01228
01229 return s_SequenceGetTranslatedRange(self, range, seqData,
01230 queryData, queryOffset, align, shouldTestIdentical);
01231 } else {
01232 return s_SequenceGetProteinRange(self, range, seqData, queryData, queryOffset, align, shouldTestIdentical);
01233 }
01234 }
01235
01236
01237
01238 typedef struct BlastKappa_GappingParamsContext {
01239 const BlastScoringParameters*
01240 scoringParams;
01241
01242 BlastGapAlignStruct * gap_align;
01243
01244 BlastScoreBlk* sbp;
01245 double localScalingFactor;
01246
01247 EBlastProgramType prog_number;
01248
01249 } BlastKappa_GappingParamsContext;
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270 static BlastCompo_Alignment *
01271 s_NewAlignmentFromGapAlign(BlastGapAlignStruct * gap_align,
01272 GapEditScript ** edit_script,
01273 BlastCompo_SequenceRange * query_range,
01274 BlastCompo_SequenceRange * subject_range,
01275 EMatrixAdjustRule matrix_adjust_rule)
01276 {
01277
01278 int queryStart, queryEnd, queryIndex, matchStart, matchEnd, frame;
01279 BlastCompo_Alignment * obj;
01280
01281
01282
01283 queryStart = gap_align->query_start + query_range->begin;
01284 queryEnd = gap_align->query_stop + query_range->begin;
01285 queryIndex = query_range->context;
01286 matchStart = gap_align->subject_start + subject_range->begin;
01287 matchEnd = gap_align->subject_stop + subject_range->begin;
01288 frame = subject_range->context;
01289
01290 obj = BlastCompo_AlignmentNew(gap_align->score, matrix_adjust_rule,
01291 queryStart, queryEnd, queryIndex,
01292 matchStart, matchEnd, frame,
01293 *edit_script);
01294 if (obj != NULL) {
01295 *edit_script = NULL;
01296 }
01297 return obj;
01298 }
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01309
01310
01311
01312
01313
01314
01315
01316
01317
01318
01319
01320
01321
01322
01323
01324
01325
01326
01327
01328
01329
01330
01331
01332
01333
01334
01335 static int
01336 s_NewAlignmentUsingXdrop(BlastCompo_Alignment ** pnewAlign,
01337 Int4 * pqueryEnd, Int4 *pmatchEnd,
01338 Int4 queryStart, Int4 matchStart, Int4 score,
01339 BlastCompo_SequenceData * query,
01340 BlastCompo_SequenceRange * query_range,
01341 Int4 ccat_query_length,
01342 BlastCompo_SequenceData * subject,
01343 BlastCompo_SequenceRange * subject_range,
01344 Int4 full_subject_length,
01345 BlastCompo_GappingParams * gapping_params,
01346 EMatrixAdjustRule matrix_adjust_rule)
01347 {
01348 Int4 newScore;
01349
01350
01351
01352 Int4 queryExtent, matchExtent;
01353 BlastCompo_Alignment * obj = NULL;
01354
01355 BlastKappa_GappingParamsContext * context = gapping_params->context;
01356
01357 BlastGapAlignStruct * gap_align = context->gap_align;
01358
01359 const BlastScoringParameters* scoringParams = context->scoringParams;
01360
01361 GapEditScript* editScript = NULL;
01362
01363
01364
01365 (void) ccat_query_length;
01366 (void) full_subject_length;
01367
01368 gap_align->gap_x_dropoff = gapping_params->x_dropoff;
01369
01370 s_SWFindFinalEndsUsingXdrop(query, queryStart, *pqueryEnd,
01371 subject, matchStart, *pmatchEnd,
01372 gap_align, scoringParams,
01373 score, &queryExtent, &matchExtent,
01374 &newScore);
01375 *pqueryEnd = queryStart + queryExtent;
01376 *pmatchEnd = matchStart + matchExtent;
01377
01378 editScript =
01379 Blast_PrelimEditBlockToGapEditScript(gap_align->rev_prelim_tback,
01380 gap_align->fwd_prelim_tback);
01381 if (editScript != NULL) {
01382
01383 Int4 aqueryStart = queryStart + query_range->begin;
01384 Int4 aqueryEnd = *pqueryEnd + query_range->begin;
01385 Int4 amatchStart = matchStart + subject_range->begin;
01386 Int4 amatchEnd = *pmatchEnd + subject_range->begin;
01387
01388 obj = BlastCompo_AlignmentNew(newScore, matrix_adjust_rule,
01389 aqueryStart, aqueryEnd,
01390 query_range->context,
01391 amatchStart, amatchEnd,
01392 subject_range->context, editScript);
01393 if (obj == NULL) {
01394 GapEditScriptDelete(editScript);
01395 }
01396 }
01397 *pnewAlign = obj;
01398
01399 return obj != NULL ? 0 : -1;
01400 }
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410
01411
01412
01413
01414
01415
01416
01417
01418
01419
01420
01421 static BlastCompo_Alignment *
01422 s_RedoOneAlignment(BlastCompo_Alignment * in_align,
01423 EMatrixAdjustRule matrix_adjust_rule,
01424 BlastCompo_SequenceData * query_data,
01425 BlastCompo_SequenceRange * query_range,
01426 int ccat_query_length,
01427 BlastCompo_SequenceData * subject_data,
01428 BlastCompo_SequenceRange * subject_range,
01429 int full_subject_length,
01430 BlastCompo_GappingParams * gapping_params)
01431 {
01432 int status;
01433 Int4 q_start, s_start;
01434
01435 BlastKappa_GappingParamsContext * context = gapping_params->context;
01436
01437 BlastScoreBlk* sbp = context->sbp;
01438
01439 BlastGapAlignStruct* gapAlign = context->gap_align;
01440
01441 BlastHSP * hsp = in_align->context;
01442
01443
01444
01445 (void) ccat_query_length;
01446 (void) full_subject_length;
01447
01448
01449
01450
01451 hsp->subject.offset -= subject_range->begin;
01452 hsp->subject.end -= subject_range->begin;
01453 hsp->subject.gapped_start -= subject_range->begin;
01454
01455 if(BLAST_CheckStartForGappedAlignment(hsp, query_data->data,
01456 subject_data->data, sbp)) {
01457
01458 q_start = hsp->query.gapped_start;
01459 s_start = hsp->subject.gapped_start;
01460 } else {
01461
01462
01463 q_start =
01464 BlastGetStartForGappedAlignment(query_data->data,
01465 subject_data->data, sbp,
01466 hsp->query.offset,
01467 hsp->query.end -
01468 hsp->query.offset,
01469 hsp->subject.offset,
01470 hsp->subject.end -
01471 hsp->subject.offset);
01472 s_start =
01473 (hsp->subject.offset - hsp->query.offset) + q_start;
01474 }
01475
01476
01477 hsp->subject.offset += subject_range->begin;
01478 hsp->subject.end += subject_range->begin;
01479 hsp->subject.gapped_start += subject_range->begin;
01480
01481 gapAlign->gap_x_dropoff = gapping_params->x_dropoff;
01482
01483 status =
01484 BLAST_GappedAlignmentWithTraceback(context->prog_number,
01485 query_data->data,
01486 subject_data->data, gapAlign,
01487 context->scoringParams,
01488 q_start, s_start,
01489 query_data->length,
01490 subject_data->length,
01491 NULL);
01492 if (status == 0) {
01493 return s_NewAlignmentFromGapAlign(gapAlign, &gapAlign->edit_script,
01494 query_range, subject_range,
01495 matrix_adjust_rule);
01496 } else {
01497 return NULL;
01498 }
01499 }
01500
01501
01502
01503
01504
01505
01506
01507 typedef struct BlastKappa_SavedParameters {
01508 Int4 gap_open;
01509 Int4 gapExtend;
01510
01511 double scale_factor;
01512 Int4 **origMatrix;
01513 double original_expect_value;
01514
01515
01516 Blast_KarlinBlk** kbp_gap_orig;
01517 Int4 num_queries;
01518 } BlastKappa_SavedParameters;
01519
01520
01521
01522
01523
01524
01525
01526 static void
01527 s_SavedParametersFree(BlastKappa_SavedParameters ** searchParams)
01528 {
01529
01530 BlastKappa_SavedParameters *sp = *searchParams;
01531
01532 if (sp != NULL) {
01533 if (sp->kbp_gap_orig != NULL) {
01534 int i;
01535 for (i = 0; i < sp->num_queries; i++) {
01536 if (sp->kbp_gap_orig[i] != NULL)
01537 Blast_KarlinBlkFree(sp->kbp_gap_orig[i]);
01538 }
01539 free(sp->kbp_gap_orig);
01540 }
01541 if (sp->origMatrix != NULL)
01542 Nlm_Int4MatrixFree(&sp->origMatrix);
01543 }
01544 sfree(*searchParams);
01545 *searchParams = NULL;
01546 }
01547
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557 static BlastKappa_SavedParameters *
01558 s_SavedParametersNew(Int4 rows,
01559 Int4 numQueries,
01560 ECompoAdjustModes compo_adjust_mode,
01561 Boolean positionBased)
01562 {
01563 int i;
01564 BlastKappa_SavedParameters *sp;
01565 sp = malloc(sizeof(BlastKappa_SavedParameters));
01566
01567 if (sp == NULL) {
01568 goto error_return;
01569 }
01570 sp->kbp_gap_orig = NULL;
01571 sp->origMatrix = NULL;
01572
01573 sp->kbp_gap_orig = calloc(numQueries, sizeof(Blast_KarlinBlk*));
01574 if (sp->kbp_gap_orig == NULL) {
01575 goto error_return;
01576 }
01577 sp->num_queries = numQueries;
01578 for (i = 0; i < numQueries; i++) {
01579 sp->kbp_gap_orig[i] = NULL;
01580 }
01581 if (compo_adjust_mode != eNoCompositionBasedStats) {
01582 if (positionBased) {
01583 sp->origMatrix = Nlm_Int4MatrixNew(rows, BLASTAA_SIZE);
01584 } else {
01585 sp->origMatrix = Nlm_Int4MatrixNew(BLASTAA_SIZE, BLASTAA_SIZE);
01586 }
01587 if (sp->origMatrix == NULL)
01588 goto error_return;
01589 }
01590 return sp;
01591 error_return:
01592 s_SavedParametersFree(&sp);
01593 return NULL;
01594 }
01595
01596
01597
01598
01599
01600
01601
01602
01603
01604
01605
01606
01607
01608 static int
01609 s_RecordInitialSearch(BlastKappa_SavedParameters * searchParams,
01610 BlastScoreBlk* sbp,
01611 const BlastScoringParameters* scoring,
01612 int query_length,
01613 ECompoAdjustModes compo_adjust_mode,
01614 Boolean positionBased)
01615 {
01616 int i;
01617
01618 searchParams->gap_open = scoring->gap_open;
01619 searchParams->gapExtend = scoring->gap_extend;
01620 searchParams->scale_factor = scoring->scale_factor;
01621
01622 for (i = 0; i < searchParams->num_queries; i++) {
01623 if (sbp->kbp_gap[i] != NULL) {
01624
01625 searchParams->kbp_gap_orig[i] = Blast_KarlinBlkNew();
01626 if (searchParams->kbp_gap_orig[i] == NULL) {
01627 return -1;
01628 }
01629 Blast_KarlinBlkCopy(searchParams->kbp_gap_orig[i],
01630 sbp->kbp_gap[i]);
01631 }
01632 }
01633
01634 if (compo_adjust_mode != eNoCompositionBasedStats) {
01635 Int4 **matrix;
01636 int j;
01637 int rows;
01638 if (positionBased) {
01639 matrix = sbp->psi_matrix->pssm->data;
01640 rows = query_length;
01641 } else {
01642 matrix = sbp->matrix->data;
01643 rows = BLASTAA_SIZE;
01644 }
01645 for (i = 0; i < rows; i++) {
01646 for (j = 0; j < BLASTAA_SIZE; j++) {
01647 searchParams->origMatrix[i][j] = matrix[i][j];
01648 }
01649 }
01650 }
01651 return 0;
01652 }
01653
01654
01655
01656
01657
01658
01659
01660
01661
01662
01663
01664 static void
01665 s_RescaleSearch(BlastScoreBlk* sbp,
01666 BlastScoringParameters* sp,
01667 int num_queries,
01668 double scale_factor)
01669 {
01670 int i;
01671 for (i = 0; i < num_queries; i++) {
01672 if (sbp->kbp_gap[i] != NULL) {
01673 Blast_KarlinBlk * kbp = sbp->kbp_gap[i];
01674 kbp->Lambda /= scale_factor;
01675 kbp->logK = log(kbp->K);
01676 }
01677 }
01678
01679 sp->gap_open = BLAST_Nint(sp->gap_open * scale_factor);
01680 sp->gap_extend = BLAST_Nint(sp->gap_extend * scale_factor);
01681 sp->scale_factor = scale_factor;
01682 }
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695 static void
01696 s_RestoreSearch(BlastScoreBlk* sbp,
01697 BlastScoringParameters* scoring,
01698 const BlastKappa_SavedParameters * searchParams,
01699 int query_length,
01700 Boolean positionBased,
01701 ECompoAdjustModes compo_adjust_mode)
01702 {
01703 int i;
01704
01705 scoring->gap_open = searchParams->gap_open;
01706 scoring->gap_extend = searchParams->gapExtend;
01707 scoring->scale_factor = searchParams->scale_factor;
01708
01709 for (i = 0; i < searchParams->num_queries; i++) {
01710 if (sbp->kbp_gap[i] != NULL) {
01711 Blast_KarlinBlkCopy(sbp->kbp_gap[i],
01712 searchParams->kbp_gap_orig[i]);
01713 }
01714 }
01715 if(compo_adjust_mode != eNoCompositionBasedStats) {
01716 int j;
01717 Int4 ** matrix;
01718 int rows;
01719
01720 if (positionBased) {
01721 matrix = sbp->psi_matrix->pssm->data;
01722 rows = query_length;
01723 } else {
01724 matrix = sbp->matrix->data;
01725 rows = BLASTAA_SIZE;
01726 }
01727 for (i = 0; i < rows; i++) {
01728 for (j = 0; j < BLASTAA_SIZE; j++) {
01729 matrix[i][j] = searchParams->origMatrix[i][j];
01730 }
01731 }
01732 }
01733 }
01734
01735
01736
01737
01738
01739
01740
01741
01742
01743
01744
01745
01746 static int
01747 s_MatrixInfoInit(Blast_MatrixInfo * self,
01748 BLAST_SequenceBlk* queryBlk,
01749 BlastScoreBlk* sbp,
01750 double scale_factor,
01751 const char * matrixName)
01752 {
01753 int status = 0;
01754 int lenName;
01755
01756
01757 lenName = strlen(matrixName);
01758 if (NULL == (self->matrixName = malloc(lenName + 1))) {
01759 return -1;
01760 }
01761 memcpy(self->matrixName, matrixName, lenName + 1);
01762
01763 if (self->positionBased) {
01764 status = s_GetPosBasedStartFreqRatios(self->startFreqRatios,
01765 queryBlk->length,
01766 queryBlk->sequence,
01767 matrixName,
01768 sbp->psi_matrix->freq_ratios);
01769 if (status == 0) {
01770 status = s_ScalePosMatrix(self->startMatrix, matrixName,
01771 sbp->psi_matrix->freq_ratios,
01772 queryBlk->sequence,
01773 queryBlk->length, sbp, scale_factor);
01774 self->ungappedLambda = sbp->kbp_psi[0]->Lambda / scale_factor;
01775 }
01776 } else {
01777 self->ungappedLambda = sbp->kbp_ideal->Lambda / scale_factor;
01778 status = s_GetStartFreqRatios(self->startFreqRatios, matrixName);
01779 if (status == 0) {
01780 Blast_Int4MatrixFromFreq(self->startMatrix, self->cols,
01781 self->startFreqRatios,
01782 self->ungappedLambda);
01783 }
01784 }
01785 return status;
01786 }
01787
01788
01789
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799 static BlastCompo_QueryInfo *
01800 s_GetQueryInfo(Uint1 * query_data, BlastQueryInfo * blast_query_info)
01801 {
01802 int i;
01803 BlastCompo_QueryInfo *
01804 compo_query_info;
01805 int num_queries;
01806
01807
01808 num_queries = blast_query_info->num_queries;
01809 compo_query_info = calloc(num_queries, sizeof(BlastCompo_QueryInfo));
01810 if (compo_query_info != NULL) {
01811 for (i = 0; i < num_queries; i++) {
01812 BlastCompo_QueryInfo * query_info = &compo_query_info[i];
01813 BlastContextInfo * query_context = &blast_query_info->contexts[i];
01814
01815 query_info->eff_search_space =
01816 (double) query_context->eff_searchsp;
01817 query_info->origin = query_context->query_offset;
01818 query_info->seq.data = &query_data[query_info->origin];
01819 query_info->seq.length = query_context->query_length;
01820
01821 Blast_ReadAaComposition(&query_info->composition, BLASTAA_SIZE,
01822 query_info->seq.data,
01823 query_info->seq.length);
01824 }
01825 }
01826 return compo_query_info;
01827 }
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837
01838
01839
01840 static BlastCompo_GappingParams *
01841 s_GappingParamsNew(BlastKappa_GappingParamsContext * context,
01842 const BlastExtensionParameters* extendParams,
01843 int num_queries)
01844 {
01845 int i;
01846 double min_lambda = DBL_MAX;
01847 const BlastScoringParameters * scoring = context->scoringParams;
01848 const BlastExtensionOptions * options = extendParams->options;
01849
01850 BlastCompo_GappingParams * gapping_params = NULL;
01851
01852 gapping_params = malloc(sizeof(BlastCompo_GappingParams));
01853 if (gapping_params != NULL) {
01854 gapping_params->gap_open = scoring->gap_open;
01855 gapping_params->gap_extend = scoring->gap_extend;
01856 gapping_params->context = context;
01857 }
01858
01859 for (i = 0; i < num_queries; i++) {
01860 if (context->sbp->kbp_gap[i] != NULL &&
01861 context->sbp->kbp_gap[i]->Lambda < min_lambda) {
01862 min_lambda = context->sbp->kbp_gap[i]->Lambda;
01863 }
01864 }
01865 gapping_params->x_dropoff = (Int4)
01866 MAX(options->gap_x_dropoff_final*NCBIMATH_LN2 / min_lambda,
01867 extendParams->gap_x_dropoff_final);
01868 context->gap_align->gap_x_dropoff = gapping_params->x_dropoff;
01869
01870 return gapping_params;
01871 }
01872
01873
01874
01875 static const Blast_RedoAlignCallbacks
01876 redo_align_callbacks = {
01877 s_CalcLambda, s_SequenceGetRange, s_RedoOneAlignment,
01878 s_NewAlignmentUsingXdrop, s_FreeEditScript
01879 };
01880
01881
01882
01883
01884
01885
01886
01887 static Blast_RedoAlignParams *
01888 s_GetAlignParams(BlastKappa_GappingParamsContext * context,
01889 BLAST_SequenceBlk * queryBlk,
01890 BlastQueryInfo* queryInfo,
01891 const BlastHitSavingParameters* hitParams,
01892 const BlastExtensionParameters* extendParams)
01893 {
01894 int status = 0;
01895 int rows;
01896 int cutoff_s;
01897 double cutoff_e;
01898 BlastCompo_GappingParams *
01899 gapping_params = NULL;
01900
01901 Blast_MatrixInfo *
01902 scaledMatrixInfo;
01903
01904 int subject_is_translated = context->prog_number == eBlastTypeTblastn;
01905
01906 Boolean positionBased = (Boolean) (context->sbp->psi_matrix != NULL);
01907
01908 Boolean do_link_hsps = (hitParams->do_sum_stats);
01909 ECompoAdjustModes compo_adjust_mode =
01910 (ECompoAdjustModes) extendParams->options->compositionBasedStats;
01911
01912 if (do_link_hsps) {
01913 ASSERT(hitParams->link_hsp_params != NULL);
01914 cutoff_s =
01915 (int) (hitParams->cutoff_score_min * context->localScalingFactor);
01916 } else {
01917
01918 cutoff_s = 1;
01919 }
01920 cutoff_e = hitParams->options->expect_value;
01921 rows = positionBased ? queryInfo->max_length : BLASTAA_SIZE;
01922 scaledMatrixInfo = Blast_MatrixInfoNew(rows, BLASTAA_SIZE, positionBased);
01923 status = s_MatrixInfoInit(scaledMatrixInfo, queryBlk, context->sbp,
01924 context->localScalingFactor,
01925 context->scoringParams->options->matrix);
01926 if (status != 0) {
01927 return NULL;
01928 }
01929 gapping_params = s_GappingParamsNew(context, extendParams,
01930 queryInfo->num_queries);
01931 if (gapping_params == NULL) {
01932 return NULL;
01933 } else {
01934 return
01935 Blast_RedoAlignParamsNew(&scaledMatrixInfo, &gapping_params,
01936 compo_adjust_mode, positionBased,
01937 subject_is_translated,
01938 queryInfo->max_length, cutoff_s, cutoff_e,
01939 do_link_hsps, &redo_align_callbacks);
01940 }
01941 }
01942
01943
01944
01945
01946
01947
01948
01949
01950
01951
01952 static void
01953 s_FillResultsFromCompoHeaps(BlastHSPResults * results,
01954 BlastCompo_Heap heaps[],
01955 Int4 hitlist_size)
01956 {
01957 int query_index;
01958 int num_queries;
01959
01960 num_queries = results->num_queries;
01961 for (query_index = 0; query_index < num_queries; query_index++) {
01962 BlastHSPList* hsp_list;
01963 BlastHitList* hitlist;
01964 BlastCompo_Heap * heap = &heaps[query_index];
01965
01966 results->hitlist_array[query_index] = Blast_HitListNew(hitlist_size);
01967 hitlist = results->hitlist_array[query_index];
01968
01969 while (NULL != (hsp_list = BlastCompo_HeapPop(heap))) {
01970 Blast_HitListUpdate(hitlist, hsp_list);
01971 }
01972 }
01973 Blast_HSPResultsReverseOrder(results);
01974 }
01975
01976
01977
01978 static void s_ClearHeap(BlastCompo_Heap * self)
01979 {
01980 BlastHSPList* hsp_list = NULL;
01981
01982 while (NULL != (hsp_list = BlastCompo_HeapPop(self))) {
01983 hsp_list = Blast_HSPListFree(hsp_list);
01984 }
01985 }
01986
01987
01988
01989
01990
01991
01992 Int2
01993 Blast_RedoAlignmentCore(EBlastProgramType program_number,
01994 BLAST_SequenceBlk * queryBlk,
01995 BlastQueryInfo* queryInfo,
01996 BlastScoreBlk* sbp,
01997 BlastHSPStream* hsp_stream,
01998 const BlastSeqSrc* seqSrc,
01999 Int4 default_db_genetic_code,
02000 BlastScoringParameters* scoringParams,
02001 const BlastExtensionParameters* extendParams,
02002 const BlastHitSavingParameters* hitParams,
02003 const PSIBlastOptions* psiOptions,
02004 BlastHSPResults* results)
02005 {
02006 int status_code = 0;
02007
02008
02009 double localScalingFactor;
02010
02011
02012
02013 BlastKappa_SavedParameters *savedParams = NULL;
02014
02015
02016 Blast_ForbiddenRanges forbidden = {0,};
02017
02018
02019 BlastCompo_Heap * redoneMatches = NULL;
02020
02021
02022 Blast_CompositionWorkspace *NRrecord = NULL;
02023
02024 int query_index;
02025
02026 int numQueries = queryInfo->num_queries;
02027
02028 BlastGapAlignStruct* gapAlign = NULL;
02029
02030
02031 double inclusion_ethresh;
02032
02033 BlastCompo_Alignment ** alignments = NULL;
02034
02035 BlastCompo_QueryInfo * query_info = NULL;
02036 Blast_RedoAlignParams * redo_align_params = NULL;
02037 Boolean positionBased = (Boolean) (sbp->psi_matrix != NULL);
02038 ECompoAdjustModes compo_adjust_mode =
02039 (ECompoAdjustModes) extendParams->options->compositionBasedStats;
02040 Boolean smithWaterman =
02041 (Boolean) (extendParams->options->eTbackExt == eSmithWatermanTbck);
02042
02043 BlastHSPList* thisMatch = NULL;
02044
02045 BlastCompo_Alignment * incoming_aligns = NULL;
02046 Int4 **matrix;
02047 BlastKappa_GappingParamsContext gapping_params_context;
02048
02049 double pvalueForThisPair = (-1);
02050
02051 double LambdaRatio;
02052
02053
02054 int compositionTestIndex = extendParams->options->unifiedP;
02055 Uint1* genetic_code_string = GenCodeSingletonFind(default_db_genetic_code);
02056
02057 ASSERT(program_number == eBlastTypeBlastp ||
02058 program_number == eBlastTypeTblastn ||
02059 program_number == eBlastTypePsiBlast);
02060
02061 if (positionBased) {
02062 matrix = sbp->psi_matrix->pssm->data;
02063 } else {
02064 matrix = sbp->matrix->data;
02065 }
02066
02067 if (matrix == NULL) {
02068 return -1;
02069 }
02070 if (0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20") &&
02071 compo_adjust_mode == eNoCompositionBasedStats) {
02072 return -1;
02073
02074 }
02075 if (positionBased) {
02076
02077
02078 if ((int) compo_adjust_mode > 1) {
02079 compo_adjust_mode = eCompositionBasedStats;
02080 }
02081
02082 ASSERT(queryInfo->num_queries == 1);
02083 ASSERT(queryBlk->length == (Int4)sbp->psi_matrix->pssm->ncols);
02084 }
02085 if ((int) compo_adjust_mode > 1 &&
02086 !Blast_FrequencyDataIsAvailable(scoringParams->options->matrix)) {
02087 return -1;
02088 }
02089
02090 inclusion_ethresh = (psiOptions
02091 ? psiOptions->inclusion_ethresh
02092 : PSI_INCLUSION_ETHRESH);
02093 ASSERT(inclusion_ethresh != 0.0);
02094
02095
02096 savedParams =
02097 s_SavedParametersNew(queryInfo->max_length, queryInfo->num_queries,
02098 compo_adjust_mode, positionBased);
02099 if (savedParams == NULL) {
02100 status_code = -1;
02101 goto function_cleanup;
02102 }
02103 status_code =
02104 s_RecordInitialSearch(savedParams, sbp, scoringParams,
02105 queryInfo->max_length, compo_adjust_mode,
02106 positionBased);
02107 if (status_code != 0) {
02108 goto function_cleanup;
02109 }
02110 if (compo_adjust_mode != eNoCompositionBasedStats) {
02111 if((0 == strcmp(scoringParams->options->matrix, "BLOSUM62_20"))) {
02112 localScalingFactor = SCALING_FACTOR / 10;
02113 } else {
02114 localScalingFactor = SCALING_FACTOR;
02115 }
02116 } else {
02117 localScalingFactor = 1.0;
02118 }
02119 s_RescaleSearch(sbp, scoringParams, queryInfo->num_queries,
02120 localScalingFactor);
02121 status_code =
02122 BLAST_GapAlignStructNew(scoringParams, extendParams,
02123 BlastSeqSrcGetMaxSeqLen(seqSrc), sbp,
02124 &gapAlign);
02125 if (status_code != 0) {
02126 return (Int2) status_code;
02127 }
02128 gapping_params_context.gap_align = gapAlign;
02129 gapping_params_context.scoringParams = scoringParams;
02130 gapping_params_context.sbp = sbp;
02131 gapping_params_context.localScalingFactor = localScalingFactor;
02132 gapping_params_context.prog_number = program_number;
02133 redo_align_params =
02134 s_GetAlignParams(&gapping_params_context, queryBlk, queryInfo,
02135 hitParams, extendParams);
02136 if (redo_align_params == NULL) {
02137 status_code = -1;
02138 goto function_cleanup;
02139 }
02140 query_info = s_GetQueryInfo(queryBlk->sequence, queryInfo);
02141 if (query_info == NULL) {
02142 status_code = -1;
02143 goto function_cleanup;
02144 }
02145 if(smithWaterman) {
02146 status_code =
02147 Blast_ForbiddenRangesInitialize(&forbidden, queryInfo->max_length);
02148 if (status_code != 0) {
02149 goto function_cleanup;
02150 }
02151 }
02152 redoneMatches = calloc(numQueries, sizeof(BlastCompo_Heap));
02153 if (redoneMatches == NULL) {
02154 status_code = -1;
02155 goto function_cleanup;
02156 }
02157 for (query_index = 0; query_index < numQueries; query_index++) {
02158 status_code =
02159 BlastCompo_HeapInitialize(&redoneMatches[query_index],
02160 hitParams->options->hitlist_size,
02161 inclusion_ethresh);
02162 if (status_code != 0) {
02163 goto function_cleanup;
02164 }
02165 }
02166 if( (int) compo_adjust_mode > 1 && !positionBased ) {
02167 NRrecord = Blast_CompositionWorkspaceNew();
02168 status_code =
02169 Blast_CompositionWorkspaceInit(NRrecord,
02170 scoringParams->options->matrix);
02171 if (status_code != 0) {
02172 goto function_cleanup;
02173 }
02174 }
02175 alignments = calloc(numQueries, sizeof(BlastCompo_Alignment *));
02176 if (alignments == NULL) {
02177 status_code = -1;
02178 goto function_cleanup;
02179 }
02180 while (BlastHSPStreamRead(hsp_stream, &thisMatch) != kBlastHSPStream_Eof) {
02181
02182 Blast_KarlinBlk * kbp;
02183
02184
02185 BlastCompo_MatchingSequence matchingSeq = {0,};
02186 if(thisMatch->hsp_array == NULL) {
02187 continue;
02188 }
02189 if (BlastCompo_EarlyTermination(thisMatch->best_evalue,
02190 redoneMatches, numQueries)) {
02191 break;
02192 }
02193
02194 status_code =
02195 s_MatchingSequenceInitialize(&matchingSeq, program_number,
02196 seqSrc, default_db_genetic_code,
02197 thisMatch->oid);
02198 if (status_code != 0) {
02199 goto match_loop_cleanup;
02200 }
02201 incoming_aligns =
02202 s_ResultHspToDistinctAlign(thisMatch->hsp_array, thisMatch->hspcnt,
02203 queryInfo, localScalingFactor);
02204 if (incoming_aligns == NULL) {
02205 status_code = -1;
02206 goto match_loop_cleanup;
02207 }
02208
02209 kbp = sbp->kbp_gap[thisMatch->query_index];
02210 if (smithWaterman) {
02211 status_code =
02212 Blast_RedoOneMatchSmithWaterman(alignments,
02213 redo_align_params,
02214 incoming_aligns,
02215 thisMatch->hspcnt,
02216 kbp->Lambda, kbp->logK,
02217 &matchingSeq, query_info,
02218 numQueries,
02219 matrix, BLASTAA_SIZE,
02220 NRrecord, &forbidden,
02221 redoneMatches,
02222 &pvalueForThisPair,
02223 compositionTestIndex,
02224 &LambdaRatio);
02225 } else {
02226 status_code =
02227 Blast_RedoOneMatch(alignments, redo_align_params,
02228 incoming_aligns, thisMatch->hspcnt,
02229 kbp->Lambda, &matchingSeq,
02230 queryInfo->max_length, query_info,
02231 numQueries, matrix, BLASTAA_SIZE,
02232 NRrecord, &pvalueForThisPair,
02233 compositionTestIndex,
02234 &LambdaRatio);
02235 }
02236 if (status_code != 0) {
02237 goto match_loop_cleanup;
02238 }
02239 for (query_index = 0; query_index < numQueries; query_index++) {
02240
02241 if( alignments[query_index] != NULL) {
02242 double bestEvalue;
02243
02244 Int4 bestScore;
02245
02246
02247 BlastHSPList * hsp_list = NULL;
02248 void * discardedAligns = NULL;
02249 hsp_list =
02250 s_HSPListFromDistinctAlignments(&alignments[query_index],
02251 matchingSeq.index,
02252 queryInfo);
02253 if (hsp_list == NULL) {
02254 status_code = -1;
02255 goto match_loop_cleanup;
02256 }
02257 if (hsp_list->hspcnt > 1) {
02258 s_HitlistReapContained(hsp_list->hsp_array,
02259 &hsp_list->hspcnt);
02260 }
02261 status_code =
02262 s_HitlistEvaluateAndPurge(&bestScore, &bestEvalue,
02263 hsp_list,
02264 seqSrc,
02265 matchingSeq.length,
02266 program_number,
02267 queryInfo, query_index,
02268 sbp, hitParams,
02269 pvalueForThisPair, LambdaRatio,
02270 matchingSeq.index);
02271 if (status_code != 0) {
02272 goto query_loop_cleanup;
02273 }
02274 if (bestEvalue <= hitParams->options->expect_value &&
02275 BlastCompo_HeapWouldInsert(&redoneMatches[query_index],
02276 bestEvalue, bestScore,
02277 thisMatch->oid)) {
02278
02279 s_HSPListNormalizeScores(hsp_list,
02280 kbp->Lambda, kbp->logK,
02281 localScalingFactor);
02282 s_ComputeNumIdentities(queryBlk, queryInfo, seqSrc,
02283 hsp_list, scoringParams->options,
02284 genetic_code_string);
02285 status_code =
02286 BlastCompo_HeapInsert(&redoneMatches[query_index],
02287 hsp_list, bestEvalue,
02288 bestScore, thisMatch->oid,
02289 &discardedAligns);
02290 if (status_code == 0) {
02291 hsp_list = NULL;
02292 } else {
02293 goto query_loop_cleanup;
02294 }
02295 if (discardedAligns != NULL) {
02296 Blast_HSPListFree(discardedAligns);
02297 }
02298 }
02299 query_loop_cleanup:
02300 Blast_HSPListFree(hsp_list);
02301 if (status_code != 0) {
02302 goto match_loop_cleanup;
02303 }
02304 }
02305 }
02306 match_loop_cleanup:
02307 if (status_code != 0) {
02308 for (query_index = 0; query_index < numQueries; query_index++) {
02309 BlastCompo_AlignmentsFree(&alignments[query_index],
02310 s_FreeEditScript);
02311 }
02312 }
02313 s_MatchingSequenceRelease(&matchingSeq);
02314 thisMatch = Blast_HSPListFree(thisMatch);
02315 BlastCompo_AlignmentsFree(&incoming_aligns, NULL);
02316 if (status_code != 0) {
02317 goto function_cleanup;
02318 }
02319 }
02320
02321 function_cleanup:
02322 sfree(alignments);
02323 if (status_code == 0) {
02324 s_FillResultsFromCompoHeaps(results, redoneMatches,
02325 hitParams->options->hitlist_size);
02326 } else {
02327 if (redoneMatches != NULL) {
02328 s_ClearHeap(&redoneMatches[0]);
02329 }
02330 }
02331 free(query_info);
02332 Blast_RedoAlignParamsFree(&redo_align_params);
02333 if (redoneMatches != NULL) {
02334 for (query_index = 0; query_index < numQueries; query_index++) {
02335 BlastCompo_HeapRelease(&redoneMatches[query_index]);
02336 }
02337 sfree(redoneMatches); redoneMatches = NULL;
02338 }
02339 if (smithWaterman) {
02340 Blast_ForbiddenRangesRelease(&forbidden);
02341 }
02342 if (gapAlign != NULL) {
02343 gapAlign = BLAST_GapAlignStructFree(gapAlign);
02344 }
02345 s_RestoreSearch(sbp, scoringParams, savedParams, queryBlk->length,
02346 positionBased, compo_adjust_mode);
02347 s_SavedParametersFree(&savedParams);
02348 Blast_CompositionWorkspaceFree(&NRrecord);
02349
02350 return (Int2) status_code;
02351 }
02352
02353