|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/api/alignmgr2.h |
source navigation diff markup identifier search freetext search file search |
1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information (NCBI)
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government do not place any restriction on its use or reproduction.
12 * We would, however, appreciate having the NCBI and the author cited in
13 * any work or product based on this material.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * ===========================================================================
24 *
25 * File Name: alignmgr2.h
26 *
27 * Author: Sarah Wheelan
28 *
29 * Version Creation Date: 10/01
30 *
31 * $Revision: 6.21 $
32 *
33 * File Description: SeqAlign indexing, access, and manipulation functions
34 *
35 * Modifications:
36 * --------------------------------------------------------------------------
37 * $Log: alignmgr2.h,v $
38 * Revision 6.21 2003/10/09 13:46:39 rsmith
39 * Add AlnMgr2GetFirstNForSipList.
40 *
41 * Revision 6.20 2003/04/23 20:37:06 rsmith
42 * Added four functions in section 11 to allow examination of Std-Seg alignments.
43 *
44 * Revision 6.19 2003/03/31 20:17:11 todorov
45 * Added AlnMgr2IndexSeqAlignEx
46 *
47 * Revision 6.18 2002/08/07 21:57:33 kans
48 * added AlignMgr2GetFirstNForStdSeg
49 *
50 * Revision 6.17 2002/07/11 14:35:51 kans
51 * fixed Mac complaints about prototypes
52 *
53 * Revision 6.16 2002/07/11 12:55:33 wheelan
54 * added support for std-seg alignments
55 *
56 * Revision 6.15 2002/05/21 12:26:25 wheelan
57 * added n5 field to AMSmallPtr
58 *
59 * Revision 6.14 2002/04/09 18:21:55 wheelan
60 * changed params for AlnMgr2IndexAsRows
61 *
62 * Revision 6.13 2002/03/04 17:19:29 wheelan
63 * added AlnMgr2FuseSet, changed behavior of RemoveInconsistent
64 *
65 * Revision 6.12 2002/01/30 19:12:20 wheelan
66 * added RemoveInconsistentAlnsFromSet, ExtractPairwiseSeqAlign, changed behavior of GetSubAlign, changed structures and behavior of GetNextAlnBit, added GetInterruptInfo
67 *
68 * Revision 6.11 2001/12/28 22:53:46 wheelan
69 * added AlnMgr2DupAlnAndIndexes, changed amaip struct
70 *
71 * Revision 6.10 2001/12/14 12:38:35 wheelan
72 * added functions for ddv
73 *
74 * Revision 6.9 2001/11/30 16:55:07 wheelan
75 * added AlnMgr2PadConservatively
76 *
77 * Revision 6.8 2001/11/29 17:37:02 wheelan
78 * added ExtendToCoords and MergeTwoAlignments
79 *
80 * Revision 6.7 2001/11/13 14:35:33 wheelan
81 * added new field to AMSmall structure
82 *
83 * Revision 6.6 2001/11/08 19:55:32 wheelan
84 * added AlnMgr2GetNthRowSpanInSA
85 *
86 * Revision 6.5 2001/10/23 12:13:24 wheelan
87 * added #define AM_HARDSTOP
88 *
89 * Revision 6.4 2001/10/16 12:00:00 wheelan
90 * added GetParent and FreeEitherIndex
91 *
92 * Revision 6.3 2001/10/08 18:43:25 wheelan
93 * added comments
94 *
95 * Revision 6.2 2001/10/03 18:12:51 wheelan
96 * changed some colliding defines
97 *
98 * Revision 6.1 2001/10/03 14:20:30 wheelan
99 * initial checkin
100 *
101 * ==========================================================================
102 */
103
104 #ifndef _ALIGNMGR2_
105 #define _ALIGNMGR2_
106
107 #include <ncbi.h>
108 #include <sqnutils.h>
109 #include <salutil.h>
110 #include <salpedit.h>
111 #include <samutil.h>
112 #include <sequtil.h>
113
114 #undef NLM_EXTERN
115 #ifdef NLM_IMPORT
116 #define NLM_EXTERN NLM_IMPORT
117 #else
118 #define NLM_EXTERN extern
119 #endif
120
121 #ifdef __cplusplus
122 extern "C" {
123 #endif
124
125 /* defines for frequency matrix sizes (presumably the "first dimension", AMFreq.size, for nucleotide vs. protein alignments -- TODO confirm) */
126 #define AM_NUCSIZE 6
127 #define AM_PROTSIZE 26
128
129 /* max seqport window */
130 #define AM_SEQPORTSIZE 20000
131
132 /* defines for AlnMgr2ComputeScoreForPairwiseSeqAlign */
133 #define AM_GAPOPEN -11 /* NOTE(review): the scoring prototype in SECTION 8 is named AlnMgr2ComputeScoreForSeqAlign -- confirm which name is current */
134 #define AM_GAPEXT -1
135
136 /* defines for saip->indextype (both SAIndex2 and AMAlignIndex2 start with an indextype field) */
137 #define INDEX_CHILD 1
138 #define INDEX_PARENT 2
139
140 /* defines for amp->type and interrupt->type */
141 #define AM_SEQ 1 /* amp is an AlnMsg2Ptr, so amp->type is AlnMsg2.type */
142 #define AM_GAP 2
143 #define AM_INSERT 3 /* NOTE(review): no struct in this header has an interrupt "type" field; AMInterrInfo.types is the closest match -- confirm */
144 #define AM_UNALIGNED 4
145
146 #define AM2_LEFT_TAIL 0 /* which_tail selector, see AlnMgr2GetNthRowTail */
147 #define AM2_RIGHT_TAIL 1 /* which_tail selector, see AlnMgr2GetNthRowTail */
148
149 #define AM2_LEFT 1 /* presumably a value for the which_side fields -- TODO confirm */
150 #define AM2_RIGHT 2 /* presumably a value for the which_side fields -- TODO confirm */
151
152 /* defines for AlnMgr2AddInNewSA */
153 #define AM_START -1
154 #define AM_STOP 1
155 #define AM_HARDSTOP 3 /* added in revision 6.5 */
156
157 /* defines for amaip->alnstyle (the alnstyle field of AMAlignIndex2) */
158 #define AM_CONTIG_LINEAR 1
159 #define AM_INTER_LINEARF 2
160 #define AM_INTER_LINEAR 3
161 #define AM_CONTIG_NONLIN 4
162 #define AM_INTER_NONLINF 5
163 #define AM_INTER_NONLIN 6
164 #define AM2_LITE 7 /* presumably the style set by AlnMgr2IndexLite -- confirm */
165 #define AM2_FULLINDEX 8
166
167 /* defines for AMEdge.used */
168 #define AM_NOTUSED 0
169 #define AM_USED 1
170 #define AM_CONFLICT 2
171 #define AM_USED2 3
172
173 typedef struct am_sarowdat { /* per-row index data, referenced through SAIndex2.srdp */
174 Uint2Ptr sect; /* NOTE(review): the four pointers presumably list segment numbers by kind (with sequence / gapped / insert / unaligned) -- confirm in alignmgr2.c */
175 Uint2Ptr unsect;
176 Uint2Ptr insect;
177 Uint2Ptr unaligned;
178 Uint2 numsect; /* presumably the number of entries in sect -- confirm */
179 Uint2 numunsect; /* presumably the number of entries in unsect -- confirm */
180 Uint2 numinsect; /* presumably the number of entries in insect -- confirm */
181 Uint2 numunaln; /* presumably the number of entries in unaligned -- confirm */
182 } SARowDat2, PNTR SARowDat2Ptr;
183
184 typedef struct am_saindex { /* index record for a seqalign (the "saip" of the define comments, presumably) */
185 Uint1 indextype; /* INDEX_CHILD or INDEX_PARENT */
186 SeqAlignIndexFreeFunc freefunc; /* callback used to release this index; presumably SAIndex2Free2 -- confirm */
187 Uint4Ptr aligncoords; /* presumably alignment coordinates per segment -- TODO confirm */
188 Int4 anchor; /* anchor row; encoding of "unanchored" is not shown in this header */
189 SARowDat2Ptr PNTR srdp; /* per-row data; presumably numrows entries -- confirm */
190 Int4 numrows;
191 Int4 numseg;
192 Int4 numunaln; /* presumably the number of entries in unaln -- confirm */
193 Uint4Ptr unaln;
194 Int4 numinchain;
195 Int4 numsplitaln;
196 Int4 score;
197 Boolean aligned;
198 SeqAlignPtr top; /* presumably the top-level seqalign, cf. AlnMgr2GetParent -- confirm */
199 Int4 tmp; /* purpose not described in this header */
200 } SAIndex2, PNTR SAIndex2Ptr;
201
202 NLM_EXTERN Boolean LIBCALLBACK SAIndex2Free2(VoidPtr index); /* callback taking an untyped index pointer; presumably the destructor stored in SAIndex2.freefunc -- confirm */
203
204 typedef struct am_interrinfo { /* describes the inserts and unaligned regions in an interrupt; returned by AlnMgr2GetInterruptInfo */
205 Uint1 strand;
206 Int4Ptr starts; /* starts, lens and types are presumably parallel arrays of num entries -- confirm */
207 Int4Ptr lens;
208 Int4Ptr types; /* presumably AM_INSERT / AM_UNALIGNED values -- confirm */
209 Int4 num;
210 } AMInterrInfo, PNTR AMInterrInfoPtr;
211
212 NLM_EXTERN void AlnMgr2FreeInterruptInfo(AMInterrInfoPtr interrupt); /* releases a structure returned by AlnMgr2GetInterruptInfo */
213
214 typedef struct am_insert { /* insert description; same layout as AMUnalign; referenced from AMParcel.inserts */
215 Int4Ptr starts;
216 Int4Ptr lens; /* presumably parallel to starts -- confirm */
217 Int4 which_side; /* presumably AM2_LEFT or AM2_RIGHT -- TODO confirm */
218 } AMInsert, PNTR AMInsertPtr;
219
220 typedef struct am_unalign { /* unaligned-region description; same layout as AMInsert; referenced from AMParcel.unaligned */
221 Int4Ptr starts;
222 Int4Ptr lens; /* presumably parallel to starts -- confirm */
223 Int4 which_side; /* presumably AM2_LEFT or AM2_RIGHT -- TODO confirm */
224 } AMUnalign, PNTR AMUnalignPtr;
225
226 typedef struct am_parcel { /* groups an alignment start with its unaligned and insert descriptions; usage is not shown in this header */
227 Int4 alnstart;
228 Int4 sap_source; /* NOTE(review): looks like an index identifying the source seqalign -- confirm */
229 AMUnalignPtr unaligned;
230 AMInsertPtr inserts;
231 } AMParcel, PNTR AMParcelPtr;
232
233 typedef struct am_alignindex { /* index record allocated by AlnMgr2IndexLite (which fills in saps); the "amaip" of the define comments */
234 Uint1 indextype; /* INDEX_CHILD or INDEX_PARENT */
235 SeqAlignIndexFreeFunc freefunc; /* callback used to release this index; presumably AMAlignIndex2Free2 -- confirm */
236 Uint2 alnstyle; /* one of the "defines for amaip->alnstyle" values (AM_CONTIG_LINEAR ... AM2_FULLINDEX) */
237 Int4 anchor;
238 Int4 numrows;
239 SeqIdPtr PNTR ids; /* one SeqId per row */
240 Int4 numsaps; /* presumably the number of entries in saps and aligned -- confirm */
241 SeqAlignPtr PNTR saps;
242 Boolean PNTR aligned; /* for each sap -- is it used in the overall alignment? */
243 SeqAlignPtr sharedaln;
244 } AMAlignIndex2, PNTR AMAlignIndex2Ptr;
245
246 NLM_EXTERN Boolean LIBCALLBACK AMAlignIndex2Free2(VoidPtr index); /* callback taking an untyped index pointer; presumably the destructor stored in AMAlignIndex2.freefunc -- confirm */
247
248 typedef struct { /* interrupt record; AlnMsg2 carries one per side (left_interrupt / right_interrupt) and AlnMgr2GetInterruptInfo takes one as input */
249 Int4 insertlen;
250 Int4 unalnlen;
251 Int4 segnum;
252 Int4 row;
253 Int4 which_side; /* presumably AM2_LEFT or AM2_RIGHT -- TODO confirm */
254 } AMInterrupt, PNTR AMInterruptPtr;
255
256 typedef struct am_msg2 { /* request/response message for AlnMgr2GetNextAlnBit */
257 /* fields filled in by calling function */
258 Int4 from_aln; /* from is in alignment coordinates */
259 Int4 to_aln; /* to is in alignment coordinates */
260 Int4 row_num; /* which row the function wants to retrieve */
261
262 /* fields filled in by AlnMgr2GetNextAlnBit */
263 Int4 from_row; /* sequence coordinates when type is AM_SEQ, alignment coordinates when type is AM_GAP */
264 Int4 to_row; /* same coordinate system as from_row */
265 Uint1 strand;
266 Uint1 type; /* AM_SEQ or AM_GAP; NOTE(review): the define list for amp->type also names AM_INSERT and AM_UNALIGNED -- confirm whether they can appear here */
267 AMInterruptPtr left_interrupt;
268 AMInterruptPtr right_interrupt;
269
270 /* fields used internally */
271 Int4 len;
272 Int4 real_from;
273 } AlnMsg2, PNTR AlnMsg2Ptr;
274
275 NLM_EXTERN AlnMsg2Ptr AlnMsgNew2(void); /* presumably allocates an AlnMsg2 -- confirm */
276 NLM_EXTERN AlnMsg2Ptr AlnMsgFree2(AlnMsg2Ptr amp); /* presumably frees amp; the value of the returned pointer is not stated here (presumably NULL) -- confirm */
277 NLM_EXTERN void AlnMsgReNew2(AlnMsg2Ptr amp); /* presumably resets amp for reuse -- confirm */
278
279 typedef struct am_small { /* singly linked node carrying five untyped Int4 slots; meaning of each slot depends on the caller */
280 Int4 n1;
281 Int4 n2;
282 Int4 n3;
283 Int4 n4;
284 Int4 n5; /* added in revision 6.15 */
285 struct am_small PNTR next; /* next node in the list */
286 } AM_Small2, PNTR AM_Small2Ptr;
287
288 typedef struct am_consistset { /* linked-list node pairing a seqalign with per-row data; NOTE(review): presumably used for consistency checks between alignments -- confirm */
289 Int4 numrows;
290 Int4Ptr starts; /* starts, stops, strands (and perhaps which) presumably hold numrows entries each -- confirm */
291 Int4Ptr stops;
292 Uint1Ptr strands;
293 SeqAlignPtr sap;
294 Int4 used;
295 Int4Ptr which;
296 struct am_consistset PNTR next; /* next node in the list */
297 } AMConsSet, PNTR AMConsSetPtr;
298
299 typedef struct am_coreinf { /* NOTE(review): field names suggest one aligned "core" region of one row of one seqalign -- usage is not shown in this header, confirm */
300 Int4 start_core;
301 Int4 len;
302 Int4 sap_num;
303 Int4 row;
304 SeqIdPtr sip;
305 Int4 start_aln;
306 Int4 left;
307 Int4 right;
308 } AM_Core, PNTR AM_CorePtr;
309
310 typedef struct am_rowinf { /* singly linked list of (from, len) pairs; coordinate system is not stated in this header */
311 Int4 from;
312 Int4 len;
313 struct am_rowinf PNTR next; /* next node in the list */
314 } AMRowInfo, PNTR AMRowInfoPtr;
315
316 typedef struct am_condenserow { /* (SeqId, strand, row number) triple; NOTE(review): presumably used when condensing rows -- confirm */
317 SeqIdPtr sip;
318 Uint1 strand;
319 Int4 rownum;
320 } AMCdRow, PNTR AMCdRowPtr;
321
322 typedef struct am_interval { /* singly linked list node holding a stranded from/to interval; AMIntervalSet.int_head points to one */
323 Int4 from;
324 Int4 to; /* NOTE(review): whether to is inclusive is not stated here -- confirm */
325 Uint1 strand;
326 struct am_interval PNTR next; /* next node in the list */
327 } AMInterval, PNTR AMIntervalPtr;
328
329 typedef struct am_intervalset { /* singly linked list node associating a SeqId with an AMInterval list */
330 SeqIdPtr sip;
331 AMIntervalPtr int_head; /* presumably the head of the interval list for sip -- confirm */
332 Uint1 strand;
333 struct am_intervalset PNTR next; /* next node in the list */
334 } AMIntervalSet, PNTR AMIntervalSetPtr;
335
336 typedef struct am_edge { /* weighted edge between two vertices, kept in a singly linked list */
337 Int4 vertex1; /* NOTE(review): presumably identifies an AMVertex by number -- confirm */
338 Int4 vertex2;
339 Int4 weight;
340 Int4 used; /* AM_NOTUSED, AM_USED, AM_CONFLICT or AM_USED2 */
341 Boolean aligned;
342 SeqAlignPtr sap; /* presumably the seqalign this edge represents -- confirm */
343 struct am_edge PNTR next; /* next node in the list */
344 } AMEdge, PNTR AMEdgePtr;
345
346 typedef struct am_vertex { /* graph vertex: a stranded from/to range on one SeqId, kept in a singly linked list; AMQueue nodes point to these */
347 SeqIdPtr sip;
348 Int4 from;
349 Int4 to;
350 Uint1 strand;
351 Int4 numedges;
352 Boolean used;
353 Boolean visited; /* NOTE(review): presumably a traversal mark -- confirm */
354 struct am_vertex PNTR next; /* next node in the list */
355 } AMVertex, PNTR AMVertexPtr;
356
357 typedef struct am_queue { /* singly linked list node holding an AMVertex pointer */
358 AMVertexPtr vertex;
359 struct am_queue PNTR next; /* next node in the list */
360 } AMQueue, PNTR AMQueuePtr;
361
362 typedef struct am_segment { /* singly linked list node describing one segment; AMRowStart.segment points to one */
363 Int4 len;
364 Int4 which_row;
365 SeqAlignPtr sap; /* presumably the seqalign the segment comes from -- confirm */
366 Int4 aligncoord; /* presumably the alignment coordinate of the segment -- confirm */
367 struct am_segment PNTR next; /* next node in the list */
368 } AMSegment, PNTR AMSegmentPtr;
369
370 typedef struct am_rowstart { /* singly linked list node associating a SeqId with an AMSegment pointer */
371 AMSegmentPtr segment; /* presumably the first segment of the row -- confirm */
372 SeqIdPtr sip;
373 struct am_rowstart PNTR next; /* next node in the list */
374 } AMRowStart, PNTR AMRowStartPtr;
375
376 typedef struct am_frequency { /* frequency matrix returned by AlnMgr2ComputeFreqMatrix; AMFreqFree takes a pointer to one */
377 Int4Ptr PNTR freq; /* two-dimensional table, indexed first by size and then by len per the field comments below */
378 Int4 len; /* second dimension */
379 Int4 size; /* first dimension */
380 Boolean isna; /* presumably TRUE for a nucleotide matrix -- confirm */
381 } AMFreq, PNTR AMFreqPtr;
382
383 typedef struct am_bit { /* singly linked node carrying four untyped Int4 slots; meaning of each slot is not described in this header */
384 Int4 n;
385 Int4 num1;
386 Int4 num2;
387 Int4 num3;
388 struct am_bit PNTR next; /* next node in the list */
389 } AMBitty2, PNTR AMBitty2Ptr;
390
391 /***************************************************************************
392 *
393 * SECTION 1: Functions for allocating and freeing data structures used
394 * by the alignment manager; copying functions are also here.
395 *
396 ***************************************************************************/
397 NLM_EXTERN void AMFreqFree(AMFreqPtr afp); /* presumably releases a matrix obtained from AlnMgr2ComputeFreqMatrix -- confirm */
398 NLM_EXTERN void AMAlignIndexFreeEitherIndex(SeqAlignPtr sap); /* added in revision 6.4; presumably frees whichever index type (child or parent) is attached to sap -- confirm */
399 NLM_EXTERN SeqAlignPtr AlnMgr2DupAlnAndIndexes(SeqAlignPtr sap); /* added in revision 6.11; one of the "copying functions" of this section; ownership of the result is not stated here */
400
401 /***************************************************************************
402 *
403 * SECTION 2: Functions used to create the indexes for parent and child
404 * seqaligns.
405 * SECTION 2a: Functions to create indexes for child seqaligns, and
406 * to convert seqaligns to dense-seg type
407 * SECTION 2b: Functions to unpack and rearrange complicated seqaligns
408 * into simple chains of dense-seg and dense-diag types
409 * SECTION 2c: Functions to create indexes for parent seqaligns
410 * SECTION 2d: Accessory functions for parent indexing
411 *
412 ***************************************************************************/
413 /* SECTION 2a */
414 /***************************************************************************
415 *
416 * AlnMgr2IndexSingleChildSeqAlign takes a simple dense-seg or dense-diag
417 * seqalign, converts it to dense-seg, and then calls
418 * AlnMgr2IndexSingleDenseSegSA to create the indexes. If the alignment has
419 * already been indexed, this erases that index and reindexes the alignment.
420 *
421 ***************************************************************************/
422 NLM_EXTERN Boolean AlnMgr2IndexSingleChildSeqAlign(SeqAlignPtr sap); /* meaning of the Boolean result is not stated here; presumably TRUE on successful indexing -- confirm */
423
424 /* SECTION 2c */
425 /***************************************************************************
426 *
427 * AlnMgr2IndexLite takes a seqalign or a list of seqaligns, converts
428 * each alignment to a dense-seg structure and indexes it, and then
429 * allocates an AMAlignIndex2 structure and fills in the saps array.
430 *
431 ***************************************************************************/
432 NLM_EXTERN Boolean AlnMgr2IndexLite(SeqAlignPtr sap); /* sap may be a single seqalign or a list; meaning of the Boolean result is not stated here (presumably TRUE on success -- confirm) */
433
434 /***************************************************************************
435 *
436 * AlnMgr2IndexSeqAlign takes a seqalign of any type except std-seg and
437 * creates indexes on it for easy retrieval of useful information by other
438 * AlnMgr2 functions. If the seqalign is a single alignment, that alignment
439 * gets a simple index and is left alone otherwise. If the seqalign is
440 * a set of alignments or a dense-diag set, the subalignments get
441 * individually indexed and then are combined into a (fake) multiple
442 * alignment which also gets indexed. The subalignments can now be accessed
443 * as a multiple alignment by AlnMgr2 functions.
444 *
445 ***************************************************************************/
446 NLM_EXTERN void AlnMgr2IndexSeqAlign(SeqAlignPtr sap); /* no status is returned. NOTE(review): the description excludes std-seg, yet the revision 6.16 log entry mentions std-seg support -- confirm whether the restriction still holds */
447 NLM_EXTERN void AlnMgr2IndexSeqAlignEx(SeqAlignPtr sap, Boolean replace_gi); /* variant added in revision 6.19; the effect of replace_gi is not documented in this header -- see the implementation */
448
449 /***************************************************************************
450 *
451 * AlnMgr2ReIndexSeqAlign takes an indexed alignment (that has, presumably,
452 * been changed), makes sure all child seqaligns are indexed (if they are
453 * already indexed they are not reindexed), and reindexes all the child
454 * seqaligns as a set.
455 *
456 ***************************************************************************/
457 NLM_EXTERN void AlnMgr2ReIndexSeqAlign(SeqAlignPtr sap); /* sap: an already indexed alignment; no success/failure indication is returned */
458
459 NLM_EXTERN Boolean AlnMgr2IndexAsRows(SeqAlignPtr sap, Uint1 strand, Boolean truncate); /* not described in this header; its parameters were changed in revision 6.14 -- see the implementation for the meaning of strand, truncate and the result */
460
461 /***************************************************************************
462 *
463 * AlnMgr2IndexIndexedChain takes a linked list of indexed seqaligns
464 * and does an in-place transformation to an indexed parent-child
465 * seqalign set.
466 *
467 ***************************************************************************/
468 NLM_EXTERN void AlnMgr2IndexIndexedChain(SeqAlignPtr sap); /* sap: head of the linked list of already indexed seqaligns; transformed in place */
469
470 /***************************************************************************
471 *
472 * SECTION 3: Functions for debugging
473 *
474 ***************************************************************************/
475 NLM_EXTERN void am_print_sa_index(SeqAlignPtr sap, FILE *ofp); /* debugging aid; presumably writes the index of sap to ofp -- confirm */
476 NLM_EXTERN void AlnMgr2PrintSeqAlign(SeqAlignPtr sap, Int4 linesize, Boolean isnuc, FILE * ofp); /* debugging aid; presumably prints the alignment to ofp, linesize positions per line, with isnuc selecting nucleotide output -- confirm */
477
478 /***************************************************************************
479 *
480 * SECTION 4: API-level functions (and their helper functions) used to
481 * access an indexed alignment.
482 * SECTION 4a: AlnMgr2GetNextAlnBit and associated functions
483 * SECTION 4b: "GetNth" functions
484 * SECTION 4c: other functions for accessing the alignment
485 *
486 ***************************************************************************/
487 /* SECTION 4a */
488 /***************************************************************************
489 *
490 * AlnMgr2GetNextAlnBit takes an indexed seqalign and returns it, piece
491 * by piece, in the row and across the range specified in the AlnMsg
492 * structure. amp->from_aln and amp->to_aln must be filled in; these are
493 * in alignment coordinates. AlnMgr2GetNextAlnBit will return the AlnMsg
494 * structure with amp->from_row and amp->to_row filled in. If amp->type is
495 * AM_SEQ, these numbers are sequence coordinates; if amp->type is AM_GAP
496 * the numbers are alignment coordinates and there is a gap in that row.
497 * AlnMgr2GetNextAlnBit returns one continuous piece of sequence or gap
498 * at each call, and keeps returning TRUE until it has returned all the
499 * information for the piece of the alignment requested.
500 *
501 ***************************************************************************/
502 NLM_EXTERN Boolean AlnMgr2GetNextAlnBit(SeqAlignPtr sap, AlnMsg2Ptr amp); /* amp: caller fills from_aln, to_aln and row_num; call repeatedly for as long as TRUE is returned */
503
504 /* SECTION 4a */
505 /***************************************************************************
506 *
507 * AlnMgr2GetInterruptInfo returns a structure describing the inserts and
508 * unaligned regions in an interrupt. The structure is allocated by this
509 * function and must be freed with AlnMgr2FreeInterruptInfo.
510 *
511 ***************************************************************************/
512 NLM_EXTERN AMInterrInfoPtr AlnMgr2GetInterruptInfo(SeqAlignPtr sap, AMInterruptPtr interrupt); /* interrupt: presumably the left_interrupt or right_interrupt of an AlnMsg2 -- confirm; the caller frees the result */
513
514 /* SECTION 4b */
515 /***************************************************************************
516 *
517 * AlnMgr2GetNthStrand takes an indexed seqalign and a row number and
518 * returns the strand of the row indicated. A return of 0 indicates
519 * an error.
520 *
521 ***************************************************************************/
522 NLM_EXTERN Uint1 AlnMgr2GetNthStrand(SeqAlignPtr sap, Int4 n); /* n: row number (presumably 1-based like AlnMgr2GetNthSeqIdPtr -- confirm); 0 on error */
523
524 /***************************************************************************
525 *
526 * AlnMgr2GetNthSeqIdPtr returns the seqid (this is a duplicated,
527 * allocated seqid that must be freed) of the nth row (1-based) of an
528 * indexed parent or child seqalign.
529 *
530 ***************************************************************************/
531 NLM_EXTERN SeqIdPtr AlnMgr2GetNthSeqIdPtr(SeqAlignPtr sap, Int4 n); /* n is 1-based; the returned SeqId is a duplicate that the caller must free */
532
533 /***************************************************************************
534 *
535 * AlnMgr2GetNthSeqRangeInSA returns the smallest and largest sequence
536 * coordinates contained in the nth row of an indexed seqalign. Either
537 * start or stop can be NULL to only retrieve one of the coordinates.
538 * If start and stop are -1, there is an error; if they are both -2, the
539 * row is just one big insert. RANGE
540 *
541 ***************************************************************************/
542 NLM_EXTERN void AlnMgr2GetNthSeqRangeInSA(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* results are written through start and stop (either may be NULL); the function itself returns nothing */
543
544 /***************************************************************************
545 *
546 * AlnMgr2GetNthRowSpanInSA returns the least and greatest alignment
547 * coordinates (inclusive) spanned by the indicated row. Either stop or
548 * start can be NULL to retrieve just one of the coordinates.
549 *
550 ***************************************************************************/
551 NLM_EXTERN void AlnMgr2GetNthRowSpanInSA(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* added in revision 6.6; results are written through start and stop (either may be NULL) */
552
553 NLM_EXTERN Int4 AlnMgr2GetMaxTailLength(SeqAlignPtr sap, Uint1 which_tail); /* not described in this header; which_tail presumably takes AM2_LEFT_TAIL or AM2_RIGHT_TAIL as in AlnMgr2GetNthRowTail -- confirm */
554
555 /***************************************************************************
556 *
557 * AlnMgr2GetNthRowTail returns the sequence extremities that are not
558 * contained in the alignment (if the alignment starts at 10 in row 2, the
559 * tail in that row is 0-9). It takes an indexed seqalign, a 1-based row
560 * number, and AM2_LEFT_TAIL or AM2_RIGHT_TAIL, and returns the start, stop,
561 * and strand of the tail indicated in the row desired. AlnMgr2GetNthRowTail
562 * returns TRUE if the calculations were successfully completed.
563 *
564 ***************************************************************************/
565 NLM_EXTERN Boolean AlnMgr2GetNthRowTail(SeqAlignPtr sap, Int4 n, Uint1 which_tail, Int4Ptr start, Int4Ptr stop, Uint1Ptr strand); /* n: 1-based row; which_tail: AM2_LEFT_TAIL or AM2_RIGHT_TAIL; start, stop and strand are outputs */
566
567 /* SECTION 4c */
568 /***************************************************************************
569 *
570 * AlnMgr2GetAlnLength returns the total alignment length of an indexed
571 * alignment. If fill_in is TRUE, the function computes the total length
572 * of all the internal unaligned regions and adds that to the alignment
573 * length; otherwise only the aligned portions are considered.
574 *
575 ***************************************************************************/
576 NLM_EXTERN Int4 AlnMgr2GetAlnLength(SeqAlignPtr sap, Boolean fill_in); /* fill_in TRUE: internal unaligned regions are added to the length; FALSE: aligned portions only */
577
578 /* SECTION 4c functions for DDV */
579 NLM_EXTERN Boolean AlnMgr2IsSAPDiscAli(SeqAlignPtr sap); /* presumably TRUE when sap is a discontinuous alignment -- confirm */
580 NLM_EXTERN Int4 AlnMgr2GetNumAlnBlocks(SeqAlignPtr sap); /* presumably the number of aligned blocks in sap -- confirm */
581 NLM_EXTERN Boolean AlnMgr2GetNthBlockRange(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* presumably reports the range of the nth block through start/stop; base of n and coordinate system are not stated here -- confirm */
582
583 /***************************************************************************
584 *
585 * AlnMgr2GetNthUnalignedForNthRow returns the bioseq coordinates for the
586 * requested row, in the requested unaligned region. Any error will result
587 * in -1 returns for both start and stop.
588 *
589 ***************************************************************************/
590 NLM_EXTERN Boolean AlnMgr2GetNthUnalignedForNthRow(SeqAlignPtr sap, Int4 unaligned, Int4 row, Int4Ptr start, Int4Ptr stop); /* unaligned: which unaligned region; row: which row; start/stop: outputs (-1 on error); the meaning of the Boolean result is not stated here */
591
592 /***************************************************************************
593 *
594 * AlnMgr2GetNextLengthBit is called in a loop on an indexed alignment, with
595 * seg starting at 0, to return the lengths of the aligned and unaligned
596 * regions. If the length returned is negative, it's an unaligned region;
597 * otherwise it's aligned.
598 *
599 ***************************************************************************/
600 NLM_EXTERN Boolean AlnMgr2GetNextLengthBit(SeqAlignPtr sap, Int4Ptr len, Int4Ptr seg); /* len: output, negative for an unaligned region; seg: starts at 0 and is presumably advanced by each call -- confirm */
601
602 /***************************************************************************
603 *
604 * AlnMgr2GetNumRows returns the number of rows in an indexed seqalign.
605 *
606 ***************************************************************************/
607 NLM_EXTERN Int4 AlnMgr2GetNumRows(SeqAlignPtr sap); /* sap must be indexed; the value returned on error is not stated here */
608
609 /***************************************************************************
610 *
611 * AlnMgr2GetNumSegs returns the number of gap- or aligned- contiguous
612 * segments in the alignment (continuous or not).
613 *
614 ***************************************************************************/
615 NLM_EXTERN Int4 AlnMgr2GetNumSegs(SeqAlignPtr sap); /* the value returned on error is not stated here */
616
617 /***************************************************************************
618 *
619 * AlnMgr2GetNumSegsInRange returns the number of alignment segments
620 * spanned by the given range (partially or fully). The range is
621 * given in alignment coordinates.
622 *
623 ***************************************************************************/
624 NLM_EXTERN Int4 AlnMgr2GetNumSegsInRange(SeqAlignPtr sap, Int4 start, Int4 stop, Int4Ptr start_seg); /* start, stop: alignment coordinates; start_seg: undocumented output, presumably the first segment in the range -- confirm */
625
626 /***************************************************************************
627 *
628 * AlnMgr2GetNthSegmentRange returns the alignment coordinate range of the
629 * Nth segment (count starts at 1) of the seqalign. start and stop are
630 * optional arguments (in case only one end is desired).
631 *
632 ***************************************************************************/
633 NLM_EXTERN void AlnMgr2GetNthSegmentRange(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* n counts from 1; start and stop are optional outputs (presumably either may be NULL -- confirm) */
634
635 /***************************************************************************
636 *
637 * AlnMgr2GetFirstNForSip returns the first row that a seqid occurs on,
638 * or -1 if the seqid is not in the alignment or if there is another
639 * error.
640 *
641 ***************************************************************************/
642 NLM_EXTERN Int4 AlnMgr2GetFirstNForSip(SeqAlignPtr sap, SeqIdPtr sip); /* returns a row number (presumably 1-based -- confirm) or -1 */
643
644 /***************************************************************************
645 *
646 * AlnMgr2GetFirstNForSipList returns the first row that one of a list of seqids occurs on,
647 * or -1 if none of the seqids are in the alignment or if there is another
648 * error.
649 * Handy if sip comes from a BioSeq, where it can point to a linked list
650 * of SeqIds.
651 *
652 ***************************************************************************/
653 NLM_EXTERN Int4 AlnMgr2GetFirstNForSipList(SeqAlignPtr sap, SeqIdPtr sip); /* added in revision 6.21; sip may head a linked list of SeqIds; -1 when none of them is in the alignment or on error */
654
655 /***************************************************************************
656 *
657 * AlnMgr2GetParent returns the top-level seqalign associated with a given
658 * indexed alignment. It returns the actual pointer, not a copy.
659 *
660 ***************************************************************************/
661 NLM_EXTERN SeqAlignPtr AlnMgr2GetParent(SeqAlignPtr sap); /* the result is the actual pointer, not a copy, so presumably it must not be freed separately -- confirm */
662
663 /***************************************************************************
664 *
665 * SECTION 5: Functions to change, assign or retrieve an anchor row.
666 * SECTION 5a: functions for child seqaligns
667 * SECTION 5b: functions for parent seqaligns
668 * SECTION 5c: functions to retrieve anchor row information
669 *
670 ***************************************************************************/
671 /* SECTION 5b */
672
673 /***************************************************************************
674 *
675 * AlnMgr2AnchorSeqAlign takes an indexed seqalign and a row (1-based) and
676 * reindexes the alignment so that there are no gaps in the row indicated.
677 * Other rows may contain inserts after this operation. After an alignment
678 * is anchored, its length often shrinks.
679 *
680 ***************************************************************************/
681 NLM_EXTERN void AlnMgr2AnchorSeqAlign(SeqAlignPtr sap, Int4 which_row); /* which_row is 1-based; sap is reindexed and no status is returned */
682
683 /* SECTION 5c */
684 /***************************************************************************
685 *
686 * AlnMgr2FindAnchor returns the row number (1-based) of the anchor row
687 * for an indexed seqalign, or -1 if the alignment is unanchored or if
688 * there is another type of error.
689 *
690 ***************************************************************************/
691 NLM_EXTERN Int4 AlnMgr2FindAnchor(SeqAlignPtr sap); /* 1-based anchor row, or -1 when unanchored or on error */
692
693 /***************************************************************************
694 *
695 * SECTION 6: Functions for coordinate conversion (bioseq to seqalign
696 * coordinates and vice versa)
697 *
698 ***************************************************************************/
699
700 /***************************************************************************
701 *
702 * AlnMgr2MapBioseqToSeqAlign takes an indexed seqalign, a position in a
703 * row of the alignment, and a 1-based row number, and maps the row position
704 * to alignment coordinates.
705 *
706 ***************************************************************************/
707 NLM_EXTERN Int4 AlnMgr2MapBioseqToSeqAlign(SeqAlignPtr sap, Int4 pos, Int4 row); /* pos: position in the row's sequence; row: 1-based; the value returned on error is not stated here */
708
709 /***************************************************************************
710 *
711 * AlnMgr2MapSeqAlignToBioseq takes an indexed seqalign, an alignment
712 * coordinate (pos), and the 1-based number of a row, and maps the alignment
713 * coordinate to the corresponding bioseq coordinate of the row desired.
714 * A return of -1 indicates an error; a return of -2 means that the bioseq
715 * is gapped at this alignment position.
716 *
717 ***************************************************************************/
718 NLM_EXTERN Int4 AlnMgr2MapSeqAlignToBioseq(SeqAlignPtr sap, Int4 pos, Int4 row); /* pos: alignment coordinate; row: 1-based; -1 = error, -2 = the row is gapped at pos */
719
720 /***************************************************************************
721 *
722 * AlnMgr2MapRowToRow takes an indexed seqalign, a position in row1, the
723 * 1-based number of row1, and a target row (row2), and maps the bioseq
724 * coordinate in row 1 to the corresponding (aligned) bioseq coordinate in
725 * row2. A return of -1 indicates an error while a return of -2 means that
726 * the bioseq in row2 is gapped at the desired position.
727 *
728 ***************************************************************************/
729 NLM_EXTERN Int4 AlnMgr2MapRowToRow(SeqAlignPtr sap, Int4 pos, Int4 row1, Int4 row2); /* pos: bioseq coordinate in row1; -1 = error, -2 = row2 is gapped at that position */
730
731 /***************************************************************************
732 *
733 * SECTION 7: Functions to change an alignment and retrieve parts of an
734 * alignment
735 *
736 ***************************************************************************/
737 /***************************************************************************
738 *
739 * AlnMgr2TruncateSeqAlign truncates a given seqalign to contain only the
740 * bioseq coordinates from start to stop on the indicated row. Anything
741 * before those coordinates is discarded; anything remaining afterwards
742 * is made into another seqalign and put in sap->next (the original next,
743 * if any, is now at sap->next->next). Doesn't work on parent seqaligns.
744 * The function returns TRUE if the original alignment extended past stop.
745 *
746 ***************************************************************************/
747 NLM_EXTERN Boolean AlnMgr2TruncateSeqAlign(SeqAlignPtr sap, Int4 start, Int4 stop, Int4 row); /* start, stop: bioseq coordinates on row; not for parent seqaligns; any remainder is linked in at sap->next */
748
749 /***************************************************************************
750 *
751 * AlnMgr2GetSubAlign retrieves a portion of an indexed alignment, from
752 * 'from' to 'to' in the row coordinates specified, or if which_row is 0,
753 * 'from' and 'to' are assumed to be alignment coordinates. If 'to' is -1,
754 * the subalignment will go to the end of the specified row (or to the end
755 * of the whole alignment). If the alignment is discontinuous and fill_in
756 * is FALSE, the alignment will be returned as an SAS_DISC set, each piece
757 * represented by a single alignment. If the alignment is discontinuous and
758 * fill_in is TRUE, the unaligned regions will be added in to the alignment,
759 * with all gaps in all other rows. If the alignment is continuous, it
760 * doesn't matter whether fill_in is TRUE or FALSE. (SUBALIGN)
761 *
762 ***************************************************************************/
763 NLM_EXTERN SeqAlignPtr AlnMgr2GetSubAlign(SeqAlignPtr sap, Int4 from, Int4 to, Int4 which_row, Boolean fill_in); /* which_row == 0: from/to are alignment coordinates; to == -1: through the end; ownership of the result is not stated here (presumably the caller's -- confirm) */
764
765 /***************************************************************************
766 *
767 * SECTION 8: Miscellaneous functions to compute useful information
768 * about an alignment
769 *
770 ***************************************************************************/
771 /***************************************************************************
772 *
773 * AlnMgr2ComputeScoreForSeqAlign computes an ad hoc numerical score for
774 * an indexed alignment by computing a similarity score for the whole
775 * alignment (residue pair by residue pair score, from a matrix for proteins
776 * and identity for nucleotides) and then subtracting gap open and gap
777 * extension penalties.
778 *
779 ***************************************************************************/
780 NLM_EXTERN Int4 AlnMgr2ComputeScoreForSeqAlign(SeqAlignPtr sap); /* NOTE(review): the gap open / extension penalties are presumably AM_GAPOPEN and AM_GAPEXT -- confirm */
781
782 /***************************************************************************
783 *
784 * AlnMgr2ComputeFreqMatrix takes an indexed seqalign and returns a matrix
785 * indicating nucleotide or amino acid frequency at each position of the
786 * alignment. The matrix can be made over only a part of the alignment, if
787 * from and to are nonzero, and if row is nonzero, from and to are taken
788 * to be bioseq coordinates from that row (if row == 0 from and to are
789 * assumed to be alignment coordinates).
790 *
791 ***************************************************************************/
792 NLM_EXTERN AMFreqPtr AlnMgr2ComputeFreqMatrix(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row); /* row == 0: from/to are alignment coordinates; the result is presumably released with AMFreqFree -- confirm */
793
794 /***************************************************************************
795 *
796 * AlnMgr2IsItProtein takes an indexed alignment and quickly decides if
797 * it's a protein or nucleotide alignment, returning TRUE for protein.
798 *
799 ***************************************************************************/
800 NLM_EXTERN Boolean AlnMgr2IsItProtein(SeqAlignPtr sap);
801
802 /***************************************************************************
803 *
804 * SECTION 9: Sorting functions and other algorithms to help order
805 * alignments for various purposes
806 *
807 ***************************************************************************/
808 /***************************************************************************
809 *
810 * AlnMgr2SortAlnSetByNthRowPos takes an indexed parent alignment and sorts
811 * all the child alignments along the row indicated. If the indicated row
812 * is aligned on the plus strand, the alignments are sorted from smaller
813 * to larger coordinates along that row; otherwise they are sorted in
814 * reverse order.
815 *
816 ***************************************************************************/
817 NLM_EXTERN void AlnMgr2SortAlnSetByNthRowPos(SeqAlignPtr sap, Int4 row);
818
819 /***************************************************************************
820 *
821 * SECTION 10: Basic alignment operations
822 *
823 ***************************************************************************/
824
825 /***************************************************************************
826 *
827 * AlnMgr2MergeTwoAlignments takes two alignments, with identical rows in
828 * the same order (otherwise it rejects the alignments), and merges them
829 * into a single alignment. If there is unaligned space between the two
830 * alignments and this space is the same length for every row, the function
831 * aligns those sequences; it rejects alignments when the unaligned spaces
832 * are different sizes. The function returns a newly allocated alignment.
833 *
834 ***************************************************************************/
835 NLM_EXTERN SeqAlignPtr AlnMgr2MergeTwoAlignments(SeqAlignPtr sap1_orig, SeqAlignPtr sap2_orig);
836
837 /***************************************************************************
838 *
839 * AlnMgr2ExtendToCoords takes an indexed child seqalign and blindly extends
840 * it to the coordinates specified on the given row. If other rows are too
841 * short to allow this extension, the alignment is extended as far as
842 * possible. If to == -1 the extension goes to the end of the sequence
843 * specified.
844 *
845 ***************************************************************************/
846 NLM_EXTERN void AlnMgr2ExtendToCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row);
847
848 /***************************************************************************
849 *
850 * AlnMgr2PadConservatively extends an alignment so that the whole of
851 * all sequences is included. If two sequences have tails on the same
852 * side, they are each aligned with columns of all gaps:
853 *
854 * <-new aln region->
855 * xxxxxxxx----------xxxxxxxxxxxxxxxxxxxx
856 * --------xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
857 *
858 * This function returns a newly allocated alignment and doesn't change
859 * the original (except for indexing). If the extension was not done for
860 * some reason, the function returns NULL.
861 *
862 ***************************************************************************/
863 NLM_EXTERN SeqAlignPtr AlnMgr2PadConservatively(SeqAlignPtr sap);
864
865 /***************************************************************************
866 *
867 * AlnMgr2ExtractPairwiseSeqAlign takes an indexed alignment (parent or
868 * child, but must be fully indexed, not lite) and extracts a pairwise
869 * subalignment containing the two requested rows. The subalignment is
870 * unindexed and may have internal unaligned regions.
871 *
872 ***************************************************************************/
873 NLM_EXTERN SeqAlignPtr AlnMgr2ExtractPairwiseSeqAlign(SeqAlignPtr sap, Int4 n1, Int4 n2);
874
875 /***************************************************************************
876 *
877 * AlnMgr2RemoveInconsistentAlnsFromSet takes an alignment that is
878 * indexed at least at the AM2_LITE level, and prunes the child
879 * alignments so that the remaining alignments form a consistent,
880 * nonoverlapping set. All alignments must have the same number of rows,
881 * and they must be the same rows (although not necessarily in the same
882 * order). The function uses a simple greedy algorithm to construct the
883 * nonoverlapping set, starting with the highest-scoring alignment.
884 * If fuzz is negative, the function creates the best nonoverlapping set
885 * by actually truncating alignments.
886 *
887 ***************************************************************************/
888 NLM_EXTERN void AlnMgr2RemoveInconsistentAlnsFromSet(SeqAlignPtr sap_head, Int4 fuzz);
889
890 /***************************************************************************
891 *
892 * AlnMgr2FuseSet takes a set of alignments sharing all their rows and orders
893 * the alignments, then fuses together any adjacent alignments. If returnall
894 * is TRUE, all pieces are returned; if not, then only the largest piece is
895 * returned. This function will work best when called after
896 * AlnMgr2RemoveInconsistentAlnsFromSet(sap_head, -1).
897 *
898 ***************************************************************************/
899 NLM_EXTERN SeqAlignPtr AlnMgr2FuseSet(SeqAlignPtr sap_head, Boolean returnall);
900
901 /* SECTION 11 -- functions for std-segs (NOTE(review): the next three declarations are undocumented in this header; the trailing notes are inferred from names and signatures only -- confirm against the implementation) */
902 NLM_EXTERN Int4 AlignMgr2GetFirstNForStdSeg(SeqAlignPtr sap, SeqIdPtr sip); /* NOTE(review): the "AlignMgr2" prefix differs from the "AlnMgr2" prefix used by every other declaration visible in this part of the header; presumably returns the first row number matching sip -- confirm */
903 NLM_EXTERN SeqIdPtr AlnMgr2GetNthSeqIdPtrStdSeg(SeqAlignPtr sap, Int4 n); /* presumably the SeqId of row n of a Std-Seg alignment; ownership of the returned pointer is not stated here -- TODO confirm */
904 NLM_EXTERN void AlnMgr2GetNthSeqRangeInSAStdSeg(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* presumably writes the sequence range of row n through start and stop -- TODO confirm */
905
906 /***************************************************************************
907 *
908 * AlnMgr2GetSeqRangeForSipInSAStdSeg returns the smallest and largest sequence
909 * coordinates in a Std-Seg seqalign for a given Sequence Id. Also return the
910 * strand type if it is the same on every segment, else set it to Seq_strand_unknown.
911 * Any of start, stop or strand can be NULL to retrieve only some of them.
912 * If start and stop are -1, then either there is an error (not a std-seg), the SeqID does not
913 * participate in this alignment, or the alignment is one big insert on that id. Returns true if the
914 * sip was found in the alignment with real coordinates, i.e. *start would not be -1. RANGE
915 *
916 ***************************************************************************/
917 NLM_EXTERN Boolean AlnMgr2GetSeqRangeForSipInSAStdSeg(SeqAlignPtr sap, SeqIdPtr sip, Int4Ptr start, Int4Ptr stop, Uint1Ptr strand);
918
919 /***************************************************************************
920 *
921 * AlnMgr2GetSeqRangeForSipInStdSeg returns the start and stop sequence
922 * coordinates in a Std-Segment for a given Sequence Id. Also return the
923 * strand type. Any of start, stop, strand or segType can be NULL to retrieve only some of them.
924 * Returns false if the SeqID was not found in this segment, so no meaningful
925 * data was passed back in other arguments.
926 * Returns true if the sip was found, even if it is a gap (start, stop = -1).
927 * segType is set to AM_SEQ if the SeqID Sequence is not empty and one of
928 * the other sequences aligned with it is also not empty. To AM_GAP if
929 * the other sequences are all empty, and to AM_INSERT if the main sequence
930 * is empty.
931 * RANGE
932 *
933 ***************************************************************************/
934 NLM_EXTERN Boolean AlnMgr2GetSeqRangeForSipInStdSeg(
935 StdSegPtr ssp,
936 SeqIdPtr sip,
937 Int4Ptr start,
938 Int4Ptr stop,
939 Uint1Ptr strand,
940 Uint1Ptr segType); /* AM_SEQ, AM_GAP, AM_INSERT */
941
942 /***************************************************************************
943 *
944 * AlnMgr2GetNthStdSeg returns a pointer to the Nth segment of
945 * a standard segment alignment.
946 * Returns NULL if there are not n segments or sap is not a std-seg alignment.
947 * Useful to pass its return value to AlnMgr2GetSeqRangeForSipInStdSeg()
948 *
949 ***************************************************************************/
950 NLM_EXTERN StdSegPtr AlnMgr2GetNthStdSeg(SeqAlignPtr sap, Int2 n);
951
952 /***************************************************************************
953 *
954 * AlnMgr2GetNumStdSegs returns the number of segments in a standard-seg alignment.
955 * Returns -1 if sap is null or not a standard-seg alignment.
956 * This is the Std-Seg version of AlnMgr2GetNumSegs.
957 *
958 ***************************************************************************/
959 NLM_EXTERN Int4 AlnMgr2GetNumStdSegs(SeqAlignPtr sap);
960
961 /***************************************************************************
962 *
963 * The two mapping functions act a little differently for std-segs. The
964 * alignment coordinates are 1:1 linearly correlated with the longest
965 * seqloc in the set; the others may be significantly shorter.
966 * The mapping functions work with percentages of length, and map those instead of
967 * coordinates (which may not be linear).
968 *
969 ***************************************************************************/
970 NLM_EXTERN Int4 AlnMgr2MapBioseqToSeqAlignStdSeg(SeqAlignPtr sap, Int4 n, Int4 pos);
971 NLM_EXTERN Int4 AlnMgr2MapSeqAlignToBioseqStdSeg(SeqAlignPtr sap, Int4 n, Int4 pos);
972 NLM_EXTERN Int4 AlnMgr2GetAlnLengthStdSeg(SeqAlignPtr sap); /* NOTE(review): not one of "the two mapping functions" described above and otherwise undocumented here; presumably the alignment length of a Std-Seg alignment -- confirm */
973
974 /***************************************************************************/
975
976
977 #ifdef __cplusplus
978 }
979 #endif
980
981 #undef NLM_EXTERN
982 #ifdef NLM_EXPORT
983 #define NLM_EXTERN NLM_EXPORT
984 #else
985 #define NLM_EXTERN
986 #endif
987
988 #endif
989 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |