00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034 #include <ncbi_pch.hpp>
00035 #include <objmgr/seq_map_switch.hpp>
00036 #include <objmgr/seq_map.hpp>
00037 #include <objmgr/seq_map_ci.hpp>
00038 #include <objmgr/bioseq_handle.hpp>
00039 #include <objmgr/impl/seq_align_mapper.hpp>
00040 #include <objmgr/seq_loc_mapper.hpp>
00041 #include <objects/seq/Seq_hist.hpp>
00042 #include <objects/seqalign/Seq_align.hpp>
00043 #include <algorithm>
00044
00045 BEGIN_NCBI_SCOPE
00046 BEGIN_SCOPE(objects)
00047
00048 class CScope;
00049
00050 namespace {
00051
00052 struct SSeqPos;
00053
00054 struct SSeqPos
00055 {
00056 CSeq_id_Handle id;
00057 TSeqPos pos;
00058 bool minus_strand;
00059
00060 enum ESeqMapSegmentEdge
00061 {
00062 eStart,
00063 eEnd
00064 };
00065
00066 SSeqPos(const CSeqMap_CI& iter, ESeqMapSegmentEdge edge)
00067 : id(iter.GetRefSeqid()),
00068 minus_strand(iter.GetRefMinusStrand())
00069 {
00070 if ( edge == eStart ) {
00071 if ( !minus_strand ) {
00072 pos = iter.GetRefPosition();
00073 }
00074 else {
00075 pos = iter.GetRefEndPosition() - 1;
00076 }
00077 }
00078 else {
00079 if ( !minus_strand ) {
00080 pos = iter.GetRefEndPosition();
00081 }
00082 else {
00083 pos = iter.GetRefPosition() - 1;
00084 }
00085 }
00086 }
00087
00088 SSeqPos& operator+=(int diff)
00089 {
00090 pos += minus_strand? -diff: diff;
00091 return *this;
00092 }
00093 SSeqPos& operator-=(int diff)
00094 {
00095 pos += minus_strand? diff: -diff;
00096 return *this;
00097 }
00098
00099 void Reverse(void)
00100 {
00101 *this -= 1;
00102 minus_strand = !minus_strand;
00103 }
00104 };
00105 CNcbiOstream& operator<<(CNcbiOstream& out, const SSeqPos& pos)
00106 {
00107 return out << pos.id.AsString() << " @ "
00108 << pos.pos << (pos.minus_strand? " minus": " plus");
00109 }
00110
00111 struct SSeq_align_Info
00112 {
00113 CBioseq_Handle m_Master;
00114 set<CSeq_id_Handle> m_SegmentIds;
00115
00116 SSeq_align_Info(const CBioseq_Handle& master)
00117 {
00118 x_Init(master);
00119 }
00120 SSeq_align_Info(const CBioseq_Handle& master, const CSeq_align& align)
00121 {
00122 x_Init(master);
00123 Add(align);
00124 }
00125
00126 void x_Init(const CBioseq_Handle& master)
00127 {
00128 m_Master = master;
00129 for ( CSeqMap_CI seg_it =
00130 master.GetSeqMap().begin(&master.GetScope());
00131 seg_it; ++seg_it ) {
00132 if ( seg_it.GetType() == CSeqMap::eSeqRef ) {
00133 m_SegmentIds.insert(seg_it.GetRefSeqid());
00134 }
00135 }
00136 }
00137
00138 void Add(const CSeq_align& align)
00139 {
00140 SMatch match;
00141 match.align.Reset(&align);
00142 CSeq_loc_Mapper loc_mapper(new CMappingRanges,
00143 &m_Master.GetScope());
00144 CSeq_align_Mapper mapper(align, loc_mapper);
00145 ITERATE ( CSeq_align_Mapper::TSegments, s, mapper.GetSegments() ) {
00146 TSeqPos len = s->m_Len;
00147 ITERATE ( SAlignment_Segment::TRows, r1, s->m_Rows ) {
00148 if ( r1->m_Start == kInvalidSeqPos ||
00149 m_SegmentIds.find(r1->m_Id) == m_SegmentIds.end() ) {
00150 continue;
00151 }
00152 match.id1 = r1->m_Id;
00153 match.range1.SetFrom(r1->m_Start);
00154 match.range1.SetLength(len);
00155 ITERATE ( SAlignment_Segment::TRows, r2, s->m_Rows ) {
00156 if ( r2 == r1 ) {
00157 break;
00158 }
00159 if ( r2->m_Start == kInvalidSeqPos ||
00160 m_SegmentIds.find(r2->m_Id)==m_SegmentIds.end() ){
00161 continue;
00162 }
00163 match.id2 = r2->m_Id;
00164 match.range2.SetFrom(r2->m_Start);
00165 match.range2.SetLength(s->m_Len);
00166 match.same_strand = r1->SameStrand(*r2);
00167 GetMatches(match.id1, match.id2).push_back(match);
00168 }
00169 }
00170 }
00171 }
00172
00173 struct SMatch
00174 {
00175 CConstRef<CSeq_align> align;
00176 CSeq_id_Handle id1;
00177 CRange<TSeqPos> range1;
00178 CSeq_id_Handle id2;
00179 CRange<TSeqPos> range2;
00180 bool same_strand;
00181
00182 static bool Contains(const CRange<TSeqPos>& range, TSeqPos pos)
00183 {
00184 return pos >= range.GetFrom() && pos <= range.GetTo();
00185 }
00186 static TSeqPos GetAdd(const CRange<TSeqPos>& range, const SSeqPos& pos)
00187 {
00188 _ASSERT(Contains(range, pos.pos));
00189 if ( pos.minus_strand ) {
00190 return pos.pos - range.GetFrom() + 1;
00191 }
00192 else {
00193 return range.GetTo() - pos.pos + 1;
00194 }
00195 }
00196
00197 struct SMatchInfo
00198 {
00199 SMatchInfo(void)
00200 : skip(true), m1(kInvalidSeqPos), m2(kInvalidSeqPos)
00201 {
00202 }
00203 CConstRef<CSeq_align> align;
00204 bool skip;
00205 TSeqPos m1, m2;
00206 bool operator!(void) const
00207 {
00208 return skip &&
00209 (m1 == kInvalidSeqPos || m2 == kInvalidSeqPos);
00210 }
00211 bool operator<(const SMatchInfo& m) const
00212 {
00213 if ( skip != m.skip )
00214 return m.skip;
00215 return m1+m2 < m.m1+m.m2;
00216 }
00217 };
00218
00219 typedef SMatchInfo TMatch;
00220
00221
00222 static CRange<int> GetMatchPos(const CRange<TSeqPos>& range,
00223 const SSeqPos& pos)
00224 {
00225 CRange<int> ret;
00226 if ( pos.minus_strand ) {
00227 ret.SetFrom(pos.pos - range.GetTo());
00228 }
00229 else {
00230 ret.SetFrom(range.GetFrom() - pos.pos);
00231 }
00232 ret.SetLength(range.GetLength());
00233 return ret;
00234 }
00235 TMatch GetMatchOrdered(const SSeqPos& pos1, const SSeqPos& pos2) const
00236 {
00237 TMatch ret;
00238 if ( same_strand != (pos1.minus_strand==pos2.minus_strand) ) {
00239 return ret;
00240 }
00241 CRange<int> m1 = GetMatchPos(range1, pos1);
00242 CRange<int> m2 = GetMatchPos(range2, pos2);
00243
00244
00245 if ( m1.GetTo() < 0 || m2.GetTo() < 0 ) {
00246 return ret;
00247 }
00248 int l1 = m1.GetTo()-max(0, m1.GetFrom());
00249 int l2 = m2.GetTo()-max(0, m2.GetFrom());
00250 if ( l1 != l2 ) {
00251 return ret;
00252 }
00253 ret.align = align;
00254 if ( m1.GetFrom() <= 0 && m2.GetFrom() <= 0 ) {
00255 ret.skip = false;
00256 ret.m1 = ret.m2 = m1.GetTo()+1;
00257 }
00258 else {
00259 ret.m1 = m1.GetFrom();
00260 ret.m2 = m2.GetFrom();
00261 }
00262 return ret;
00263 }
00264
00265 TMatch GetMatch(const SSeqPos& pos1, const SSeqPos& pos2) const
00266 {
00267 if ( pos1.id == id1 && pos2.id == id2 ) {
00268 return GetMatchOrdered(pos1, pos2);
00269 }
00270 if ( pos2.id == id1 && pos1.id == id2 ) {
00271 TMatch ret = GetMatchOrdered(pos2, pos1);
00272 swap(ret.m1, ret.m2);
00273 return ret;
00274 }
00275 return TMatch();
00276 }
00277 };
00278 typedef vector<SMatch> TMatches;
00279 typedef pair<CSeq_id_Handle, CSeq_id_Handle> TKey;
00280 typedef map<TKey, TMatches> TMatchMap;
00281
00282 static TKey GetKey(const CSeq_id_Handle& id1, const CSeq_id_Handle& id2)
00283 {
00284 TKey key;
00285 if ( id1 < id2 ) {
00286 key.first = id1;
00287 key.second = id2;
00288 }
00289 else {
00290 key.first = id2;
00291 key.second = id1;
00292 }
00293 return key;
00294 }
00295 TMatches& GetMatches(const CSeq_id_Handle& id1, const CSeq_id_Handle& id2)
00296 {
00297 return match_map[GetKey(id1, id2)];
00298 }
00299
00300 TMatchMap match_map;
00301
00302 typedef CSeqMapSwitchPoint::TDifferences TDifferences;
00303
00304 pair<TSeqPos, TSeqPos>
00305 x_FindAlignMatch(SSeqPos pos1,
00306 SSeqPos pos2,
00307 TSeqPos limit,
00308 TDifferences& diff,
00309 CConstRef<CSeq_align>& first_align) const;
00310 };
00311
00312 pair<TSeqPos, TSeqPos>
00313 SSeq_align_Info::x_FindAlignMatch(SSeqPos pos1,
00314 SSeqPos pos2,
00315 TSeqPos limit,
00316 TDifferences& diff,
00317 CConstRef<CSeq_align>& first_align) const
00318 {
00319 pair<TSeqPos, TSeqPos> ret(0, 0);
00320 bool exact = true;
00321 TSeqPos skip1 = 0, skip2 = 0, offset = 0, skip_pos = kInvalidSeqPos;
00322 while ( limit ) {
00323 _TRACE("pos1="<<pos1<<" pos2="<<pos2);
00324 SMatch::TMatch add;
00325 TMatchMap::const_iterator miter =
00326 match_map.find(GetKey(pos1.id, pos2.id));
00327 if ( miter != match_map.end() ) {
00328 const TMatches& matches = miter->second;
00329 ITERATE ( TMatches, it, matches ) {
00330 SMatch::TMatch m = it->GetMatch(pos1, pos2);
00331 if ( m < add ) {
00332 add = m;
00333 }
00334 }
00335 }
00336 _TRACE("pos1="<<pos1<<" pos2="<<pos2<<" add="<<add.m1<<','<<add.m2);
00337 if ( !add ) {
00338 break;
00339 }
00340 if ( !first_align ) {
00341 first_align = add.align;
00342 }
00343 if ( add.skip ) {
00344 if ( skip1 == 0 ) {
00345 skip_pos = offset;
00346 }
00347 TSeqPos len = min(limit, add.m1);
00348 skip1 += add.m1;
00349 skip2 += add.m2;
00350 pos1 += add.m1;
00351 pos2 += add.m2;
00352 limit -= len;
00353 offset += len;
00354 exact = false;
00355 }
00356 else {
00357 if ( skip1 || skip2 ) {
00358 diff[skip_pos].second += skip1;
00359 diff[skip_pos].first += skip2;
00360 ret.first += skip1;
00361 skip1 = 0;
00362 skip2 = 0;
00363 skip_pos = kInvalidSeqPos;
00364 }
00365 _ASSERT(add.m1 == add.m2);
00366 TSeqPos len = min(limit, add.m1);
00367 ret.first += len;
00368 if ( exact ) {
00369 ret.second = ret.first;
00370 }
00371 pos1 += len;
00372 pos2 += len;
00373 limit -= len;
00374 offset += len;
00375 }
00376 }
00377 ITERATE ( TDifferences, i, diff ) {
00378 _TRACE("pos: "<<i->first<<" ins: "<<i->second.first<<" del: "<<i->second.second);
00379 }
00380 return ret;
00381 }
00382
00383 CRef<CSeqMapSwitchPoint> x_GetSwitchPoint(const CBioseq_Handle& seq,
00384 SSeq_align_Info& info,
00385 const CSeqMap_CI& iter1,
00386 const CSeqMap_CI& iter2)
00387 {
00388 CRef<CSeqMapSwitchPoint> sp_ref(new CSeqMapSwitchPoint);
00389 CSeqMapSwitchPoint& sp = *sp_ref;
00390 sp.m_Master = seq;
00391 TSeqPos pos = iter2.GetPosition();
00392 _ASSERT(pos == iter1.GetEndPosition());
00393 sp.m_MasterPos = pos;
00394
00395 SSeqPos pos1(iter1, SSeqPos::eEnd);
00396 SSeqPos pos2(iter2, SSeqPos::eStart);
00397
00398 sp.m_LeftId = iter1.GetRefSeqid();
00399 sp.m_LeftMinusStrand = iter1.GetRefMinusStrand();
00400 sp.m_LeftPos = pos1.pos;
00401
00402 sp.m_RightId = iter2.GetRefSeqid();
00403 sp.m_RightMinusStrand = iter2.GetRefMinusStrand();
00404 sp.m_RightPos = pos2.pos;
00405
00406 pair<TSeqPos, TSeqPos> ext2 =
00407 info.x_FindAlignMatch(pos2, pos1, iter2.GetLength(),
00408 sp.m_RightDifferences, sp.m_FirstAlign);
00409 pos1.Reverse();
00410 pos2.Reverse();
00411 pair<TSeqPos, TSeqPos> ext1 =
00412 info.x_FindAlignMatch(pos1, pos2, iter1.GetLength(),
00413 sp.m_LeftDifferences, sp.m_FirstAlign);
00414
00415 sp.m_MasterRange.SetFrom(pos-ext1.first).SetTo(pos+ext2.first);
00416 sp.m_ExactMasterRange.SetFrom(pos-ext1.second).SetTo(pos+ext2.second);
00417
00418 return sp_ref;
00419 }
00420
00421 CSeqMapSwitchPoint::TInsertDelete
00422 x_GetDifferences(const CSeqMapSwitchPoint::TDifferences& diff,
00423 TSeqPos offset, TSeqPos add)
00424 {
00425 CSeqMapSwitchPoint::TInsertDelete ret;
00426 CSeqMapSwitchPoint::TDifferences::const_iterator iter = diff.begin();
00427 while ( iter != diff.end() && offset >= iter->first ) {
00428 if ( offset - iter->first <= iter->second.second ) {
00429 TSeqPos ins = min(add, iter->second.first);
00430 ret.first += ins;
00431 TSeqPos del = offset - iter->first;
00432 ret.second += del;
00433 break;
00434 }
00435 else {
00436 ret.first += iter->second.first;
00437 ret.second += iter->second.second;
00438 }
00439 ++iter;
00440 }
00441 return ret;
00442 }
00443
00444 }
00445
00446 class CScope;
00447
00448 CSeqMapSwitchPoint::TInsertDelete
00449 CSeqMapSwitchPoint::GetDifferences(TSeqPos new_pos, TSeqPos add) const
00450 {
00451 if ( new_pos > m_MasterPos ) {
00452 return x_GetDifferences(m_RightDifferences, new_pos-m_MasterPos, add);
00453 }
00454 else if ( new_pos < m_MasterPos ) {
00455 return x_GetDifferences(m_LeftDifferences, m_MasterPos-new_pos, add);
00456 }
00457 else {
00458 return TInsertDelete();
00459 }
00460 }
00461
00462 TSeqPos CSeqMapSwitchPoint::GetInsert(TSeqPos pos) const
00463 {
00464 if ( !m_Master ) {
00465 NCBI_THROW(CObjMgrException, eInvalidHandle,
00466 "switch point is not initialized");
00467 }
00468 if ( pos < m_MasterRange.GetFrom() || pos > m_MasterRange.GetTo() ) {
00469 NCBI_THROW(CSeqMapException, eOutOfRange,
00470 "switch point is not in valid range");
00471 }
00472 const TDifferences* diff;
00473 TSeqPos offset;
00474 if ( pos < m_MasterPos ) {
00475 diff = &m_LeftDifferences;
00476 offset = m_MasterPos - pos;
00477 }
00478 else if ( pos > m_MasterPos ) {
00479 diff = &m_RightDifferences;
00480 offset = pos-m_MasterPos;
00481 }
00482 else {
00483 return 0;
00484 }
00485 TDifferences::const_iterator iter = diff->find(offset);
00486 return iter == diff->end()? 0: iter->second.first;
00487 }
00488
00489
00490 TSeqPos CSeqMapSwitchPoint::GetLeftInPlaceInsert(void) const
00491 {
00492 if ( !m_LeftDifferences.empty() &&
00493 m_LeftDifferences.begin()->first == 0 ) {
00494 return m_LeftDifferences.begin()->second.first;
00495 }
00496 return 0;
00497 }
00498
00499
00500 TSeqPos CSeqMapSwitchPoint::GetRightInPlaceInsert(void) const
00501 {
00502 if ( !m_RightDifferences.empty() &&
00503 m_RightDifferences.begin()->first == 0 ) {
00504 return m_RightDifferences.begin()->second.first;
00505 }
00506 return 0;
00507 }
00508
00509
00510 void CSeqMapSwitchPoint::ChangeSwitchPoint(TSeqPos pos, TSeqPos add)
00511 {
00512 if ( !m_Master ) {
00513 NCBI_THROW(CObjMgrException, eInvalidHandle,
00514 "switch point is not initialized");
00515 }
00516 if ( pos < m_MasterRange.GetFrom() || pos > m_MasterRange.GetTo() ) {
00517 NCBI_THROW(CSeqMapException, eOutOfRange,
00518 "switch point is not in valid range");
00519 }
00520 if ( add > 0 && add > GetInsert(pos) ) {
00521 NCBI_THROW(CSeqMapException, eOutOfRange,
00522 "adding more bases than available");
00523 }
00524 CSeqMap& seq_map = const_cast<CSeqMap&>(m_Master.GetSeqMap());
00525 CSeqMap_CI right = seq_map.FindSegment(m_MasterPos, &m_Master.GetScope());
00526 if ( right.GetPosition() != m_MasterPos ) {
00527 NCBI_THROW(CSeqMapException, eOutOfRange,
00528 "invalid CSeqMapSwitchPoint");
00529 }
00530 if ( right.GetType() != CSeqMap::eSeqRef ||
00531 right.GetRefSeqid() != m_RightId ||
00532 right.GetRefMinusStrand() != m_RightMinusStrand ||
00533 SSeqPos(right, SSeqPos::eStart).pos != m_RightPos ) {
00534 NCBI_THROW(CSeqMapException, eOutOfRange,
00535 "invalid CSeqMapSwitchPoint");
00536 }
00537 CSeqMap_CI left = right; --left;
00538 if ( left.GetType() != CSeqMap::eSeqRef ||
00539 left.GetRefSeqid() != m_LeftId ||
00540 left.GetRefMinusStrand() != m_LeftMinusStrand ||
00541 SSeqPos(left, SSeqPos::eEnd).pos != m_LeftPos ) {
00542 NCBI_THROW(CSeqMapException, eOutOfRange,
00543 "invalid CSeqMapSwitchPoint");
00544 }
00545 int left_add;
00546 int right_add;
00547 if ( pos < m_MasterPos ) {
00548 left_add = pos - m_MasterPos;
00549 right_add = -left_add + GetLengthDifference(pos) + add;
00550 _ASSERT(left_add < 0);
00551 _ASSERT(right_add > 0);
00552 }
00553 else if ( pos > m_MasterPos ) {
00554 right_add = m_MasterPos - pos;
00555 left_add = -right_add + GetLengthDifference(pos) + add;
00556 _ASSERT(right_add < 0);
00557 _ASSERT(left_add > 0);
00558 }
00559 else {
00560 left_add = 0;
00561 right_add = 0;
00562 }
00563 if ( right_add ) {
00564 TSeqPos len = right.GetLength() + right_add;
00565 TSeqPos pos = right.GetRefPosition();
00566 if ( !m_RightMinusStrand ) {
00567 pos -= right_add;
00568 }
00569 seq_map.SetSegmentRef(right, len, m_RightId, pos, m_RightMinusStrand);
00570 }
00571 if ( left_add ) {
00572 TSeqPos len = left.GetLength() + left_add;
00573 TSeqPos pos = left.GetRefPosition();
00574 if ( m_LeftMinusStrand ) {
00575 pos -= left_add;
00576 }
00577 seq_map.SetSegmentRef(left, len, m_LeftId, pos, m_LeftMinusStrand);
00578 }
00579 }
00580
00581
00582 void CSeqMapSwitchPoint::InsertInPlace(TSeqPos add_left, TSeqPos add_right)
00583 {
00584 if ( !m_Master ) {
00585 NCBI_THROW(CObjMgrException, eInvalidHandle,
00586 "switch point is not initialized");
00587 }
00588 if ( add_left && add_left > GetLeftInPlaceInsert() ||
00589 add_right && add_right > GetRightInPlaceInsert() ) {
00590 NCBI_THROW(CSeqMapException, eOutOfRange,
00591 "adding more bases than available");
00592 }
00593 }
00594
00595
00596
00597 CRef<CSeqMapSwitchPoint> GetSwitchPoint(const CBioseq_Handle& seq,
00598 const CSeq_align& align)
00599 {
00600 SSeq_align_Info info(seq, align);
00601 if ( info.match_map.size() != 1 ) {
00602 NCBI_THROW(CSeqMapException, eInvalidIndex,
00603 "Seq-align dimension is not 2");
00604 }
00605 CSeq_id_Handle id1 = info.match_map.begin()->first.first;
00606 CSeq_id_Handle id2 = info.match_map.begin()->first.second;
00607
00608 CSeqMap_CI iter1 = seq.GetSeqMap().begin(&seq.GetScope());
00609 if ( !iter1 ) {
00610 NCBI_THROW(CSeqMapException, eInvalidIndex,
00611 "Sequence is not segmented");
00612 }
00613 CSeqMap_CI iter2 = iter1;
00614 ++iter2;
00615
00616 for ( ; iter2; ++iter1, ++iter2 ) {
00617 if ( iter1.GetType() == CSeqMap::eSeqRef &&
00618 iter2.GetType() == CSeqMap::eSeqRef ) {
00619 if ( iter1.GetRefSeqid() == id1 && iter2.GetRefSeqid() == id2 ||
00620 iter1.GetRefSeqid() == id2 && iter2.GetRefSeqid() == id1 ) {
00621 return x_GetSwitchPoint(seq, info, iter1, iter2);
00622 }
00623 }
00624 }
00625
00626 NCBI_THROW(CSeqMapException, eInvalidIndex,
00627 "Seq-align doesn't refer to segments");
00628 }
00629
00630
00631 TSeqMapSwitchPoints GetAllSwitchPoints(const CBioseq_Handle& seq,
00632 const TSeqMapSwitchAligns& aligns)
00633 {
00634 TSeqMapSwitchPoints pp;
00635
00636 CSeqMap_CI iter1 = seq.GetSeqMap().begin(&seq.GetScope());
00637 if ( !iter1 ) {
00638 NCBI_THROW(CSeqMapException, eInvalidIndex,
00639 "Sequence is not segmented");
00640 }
00641 CSeqMap_CI iter2 = iter1;
00642 ++iter2;
00643
00644 SSeq_align_Info info(seq);
00645 ITERATE ( TSeqMapSwitchAligns, it, aligns ) {
00646 info.Add(**it);
00647 }
00648
00649 for ( ; iter2; ++iter1, ++iter2 ) {
00650 if ( iter1.GetType() == CSeqMap::eSeqRef &&
00651 iter2.GetType() == CSeqMap::eSeqRef ) {
00652 pp.push_back(x_GetSwitchPoint(seq, info, iter1, iter2));
00653 }
00654 }
00655 return pp;
00656 }
00657
00658
00659 TSeqMapSwitchPoints GetAllSwitchPoints(const CBioseq_Handle& seq)
00660 {
00661 return GetAllSwitchPoints(seq, seq.GetInst_Hist().GetAssembly());
00662 }
00663
00664
00665 END_SCOPE(objects)
00666 END_NCBI_SCOPE
00667
00668