NCBI C Toolkit Cross Reference

C/api/alignmgr2.h


  1 /* ===========================================================================
  2 *
  3 *                            PUBLIC DOMAIN NOTICE
  4 *            National Center for Biotechnology Information (NCBI)
  5 *
  6 *  This software/database is a "United States Government Work" under the
  7 *  terms of the United States Copyright Act.  It was written as part of
  8 *  the author's official duties as a United States Government employee and
  9 *  thus cannot be copyrighted.  This software/database is freely available
 10 *  to the public for use. The National Library of Medicine and the U.S.
 11 *  Government do not place any restriction on its use or reproduction.
 12 *  We would, however, appreciate having the NCBI and the author cited in
 13 *  any work or product based on this material.
 14 *
 15 *  Although all reasonable efforts have been taken to ensure the accuracy
 16 *  and reliability of the software and data, the NLM and the U.S.
 17 *  Government do not and cannot warrant the performance or results that
 18 *  may be obtained by using this software or data. The NLM and the U.S.
 19 *  Government disclaim all warranties, express or implied, including
 20 *  warranties of performance, merchantability or fitness for any particular
 21 *  purpose.
 22 *
 23 * ===========================================================================
 24 *
 25 * File Name:  alignmgr2.h
 26 *
 27 * Author:  Sarah Wheelan
 28 *
 29 * Version Creation Date:  10/01 
 30 *
 31 * $Revision: 6.21 $
 32 *
 33 * File Description: SeqAlign indexing, access, and manipulation functions
 34 *
 35 * Modifications:
 36 * --------------------------------------------------------------------------
 37 * $Log: alignmgr2.h,v $
 38 * Revision 6.21  2003/10/09 13:46:39  rsmith
 39 * Add AlnMgr2GetFirstNForSipList.
 40 *
 41 * Revision 6.20  2003/04/23 20:37:06  rsmith
 42 * Added four functions in section 11 to allow examination of Std-Seg alignments.
 43 *
 44 * Revision 6.19  2003/03/31 20:17:11  todorov
 45 * Added AlnMgr2IndexSeqAlignEx
 46 *
 47 * Revision 6.18  2002/08/07 21:57:33  kans
 48 * added AlignMgr2GetFirstNForStdSeg
 49 *
 50 * Revision 6.17  2002/07/11 14:35:51  kans
 51 * fixed Mac complaints about prototypes
 52 *
 53 * Revision 6.16  2002/07/11 12:55:33  wheelan
 54 * added support for std-seg alignments
 55 *
 56 * Revision 6.15  2002/05/21 12:26:25  wheelan
 57 * added n5 field to AMSmallPtr
 58 *
 59 * Revision 6.14  2002/04/09 18:21:55  wheelan
 60 * changed params for AlnMgr2IndexAsRows
 61 *
 62 * Revision 6.13  2002/03/04 17:19:29  wheelan
 63 * added AlnMgr2FuseSet, changed behavior of RemoveInconsistent
 64 *
 65 * Revision 6.12  2002/01/30 19:12:20  wheelan
 66 * added RemoveInconsistentAlnsFromSet, ExtractPairwiseSeqAlign, changed behavior of GetSubAlign, changed structures and behavior of GetNextAlnBit, added GetInterruptInfo
 67 *
 68 * Revision 6.11  2001/12/28 22:53:46  wheelan
 69 * added AlnMgr2DupAlnAndIndexes, changed amaip struct
 70 *
 71 * Revision 6.10  2001/12/14 12:38:35  wheelan
 72 * added functions for ddv
 73 *
 74 * Revision 6.9  2001/11/30 16:55:07  wheelan
 75 * added AlnMgr2PadConservatively
 76 *
 77 * Revision 6.8  2001/11/29 17:37:02  wheelan
 78 * added ExtendToCoords and MergeTwoAlignments
 79 *
 80 * Revision 6.7  2001/11/13 14:35:33  wheelan
 81 * added new field to AMSmall structure
 82 *
 83 * Revision 6.6  2001/11/08 19:55:32  wheelan
 84 * added AlnMgr2GetNthRowSpanInSA
 85 *
 86 * Revision 6.5  2001/10/23 12:13:24  wheelan
 87 * added #define AM_HARDSTOP
 88 *
 89 * Revision 6.4  2001/10/16 12:00:00  wheelan
 90 * added GetParent and FreeEitherIndex
 91 *
 92 * Revision 6.3  2001/10/08 18:43:25  wheelan
 93 * added comments
 94 *
 95 * Revision 6.2  2001/10/03 18:12:51  wheelan
 96 * changed some colliding defines
 97 *
 98 * Revision 6.1  2001/10/03 14:20:30  wheelan
 99 * initial checkin
100 *
101 * ==========================================================================
102 */
103 
104 #ifndef _ALIGNMGR2_
105 #define _ALIGNMGR2_
106 
107 #include <ncbi.h>
108 #include <sqnutils.h>
109 #include <salutil.h>
110 #include <salpedit.h>
111 #include <samutil.h>
112 #include <sequtil.h>
113 
114 #undef NLM_EXTERN
115 #ifdef NLM_IMPORT
116 #define NLM_EXTERN NLM_IMPORT
117 #else
118 #define NLM_EXTERN extern
119 #endif
120 
121 #ifdef __cplusplus
122 extern "C" {
123 #endif
124 
125 /* defines for frequency matrix sizes */
126 #define AM_NUCSIZE   6 /* presumably the AMFreq.size used for nucleotide alignments -- confirm */
127 #define AM_PROTSIZE  26 /* presumably the AMFreq.size used for protein alignments -- confirm */
128 
129 /* max seqport window */
130 #define AM_SEQPORTSIZE 20000
131 
132 /* defines for AlnMgr2ComputeScoreForPairwiseSeqAlign */
133 #define AM_GAPOPEN  -11
134 #define AM_GAPEXT   -1
135 
136 /* defines for saip->indextype */
137 #define INDEX_CHILD   1
138 #define INDEX_PARENT  2
139 
140 /* defines for amp->type and interrupt->type */
141 #define AM_SEQ        1
142 #define AM_GAP        2
143 #define AM_INSERT     3
144 #define AM_UNALIGNED  4
145 
146 #define AM2_LEFT_TAIL   0 /* which_tail argument of AlnMgr2GetNthRowTail */
147 #define AM2_RIGHT_TAIL  1 /* which_tail argument of AlnMgr2GetNthRowTail */
148 
149 #define AM2_LEFT        1 /* NOTE(review): no user is named in this header; presumably a which_side value -- confirm */
150 #define AM2_RIGHT       2 /* NOTE(review): presumably the counterpart which_side value -- confirm */
151 
152 /* defines for AlnMgr2AddInNewSA */
153 #define AM_START    -1
154 #define AM_STOP      1
155 #define AM_HARDSTOP  3
156 
157 /* defines for amaip->alnstyle */
158 #define AM_CONTIG_LINEAR  1
159 #define AM_INTER_LINEARF  2
160 #define AM_INTER_LINEAR   3
161 #define AM_CONTIG_NONLIN  4
162 #define AM_INTER_NONLINF  5
163 #define AM_INTER_NONLIN   6
164 #define AM2_LITE          7
165 #define AM2_FULLINDEX     8
166 
167 /* defines for AMEdge.used */
168 #define AM_NOTUSED   0
169 #define AM_USED      1
170 #define AM_CONFLICT  2
171 #define AM_USED2     3
172 
173 typedef struct am_sarowdat { /* row data referenced through SAIndex2.srdp (presumably one entry per row -- confirm) */
174    Uint2Ptr  sect;
175    Uint2Ptr  unsect;
176    Uint2Ptr  insect;
177    Uint2Ptr  unaligned;
178    Uint2     numsect; /* presumably the element count of sect -- confirm */
179    Uint2     numunsect; /* presumably the element count of unsect -- confirm */
180    Uint2     numinsect; /* presumably the element count of insect -- confirm */
181    Uint2     numunaln; /* presumably the element count of unaligned -- confirm */
182 } SARowDat2, PNTR SARowDat2Ptr;
183 
184 typedef struct am_saindex { /* seqalign index; shares its first two fields (indextype, freefunc) with AMAlignIndex2 */
185    Uint1                  indextype; /* presumably INDEX_CHILD or INDEX_PARENT (the "saip->indextype" defines) -- confirm */
186    SeqAlignIndexFreeFunc  freefunc; /* presumably set to SAIndex2Free2 -- confirm */
187    Uint4Ptr               aligncoords;
188    Int4                   anchor; /* presumably the anchor row reported by AlnMgr2FindAnchor -- confirm */
189    SARowDat2Ptr           PNTR srdp; /* pointer to SARowDat2Ptr entries; presumably numrows of them -- confirm */
190    Int4                   numrows;
191    Int4                   numseg;
192    Int4                   numunaln; /* presumably the element count of unaln -- confirm */
193    Uint4Ptr               unaln;
194    Int4                   numinchain;
195    Int4                   numsplitaln;
196    Int4                   score;
197    Boolean                aligned;
198    SeqAlignPtr            top; /* presumably the top-level seqalign returned by AlnMgr2GetParent -- confirm */
199    Int4                   tmp;
200 } SAIndex2, PNTR SAIndex2Ptr;
201 
202 NLM_EXTERN Boolean LIBCALLBACK SAIndex2Free2(VoidPtr index); /* takes the index as a VoidPtr; presumably the freefunc for SAIndex2 -- confirm */
203 
204 typedef struct am_interrinfo { /* result of AlnMgr2GetInterruptInfo; must be freed with AlnMgr2FreeInterruptInfo */
205    Uint1    strand;
206    Int4Ptr  starts;
207    Int4Ptr  lens;
208    Int4Ptr  types; /* presumably AM_* values from the "interrupt->type" defines -- confirm */
209    Int4     num; /* presumably the element count of starts, lens and types -- confirm */
210 } AMInterrInfo, PNTR AMInterrInfoPtr;
211 
212 NLM_EXTERN void AlnMgr2FreeInterruptInfo(AMInterrInfoPtr interrupt); /* frees a structure returned by AlnMgr2GetInterruptInfo */
213 
214 typedef struct am_insert { /* same layout as AMUnalign; AMParcel.inserts points to one */
215    Int4Ptr  starts;
216    Int4Ptr  lens;
217    Int4     which_side; /* presumably AM2_LEFT or AM2_RIGHT -- confirm */
218 } AMInsert, PNTR AMInsertPtr;
219 
220 typedef struct am_unalign { /* same layout as AMInsert; AMParcel.unaligned points to one */
221    Int4Ptr  starts;
222    Int4Ptr  lens;
223    Int4     which_side; /* presumably AM2_LEFT or AM2_RIGHT -- confirm */
224 } AMUnalign, PNTR AMUnalignPtr;
225 
226 typedef struct am_parcel { /* NOTE(review): purpose is not documented in this header */
227    Int4          alnstart;
228    Int4          sap_source; /* presumably identifies the source seqalign -- confirm */
229    AMUnalignPtr  unaligned; /* pointer to an AMUnalign */
230    AMInsertPtr   inserts; /* pointer to an AMInsert */
231 } AMParcel, PNTR AMParcelPtr;
232    
233 typedef struct am_alignindex { /* index structure allocated by AlnMgr2IndexLite (per its description), which fills in saps */
234    Uint1                  indextype; /* presumably INDEX_CHILD or INDEX_PARENT -- confirm */
235    SeqAlignIndexFreeFunc  freefunc; /* presumably set to AMAlignIndex2Free2 -- confirm */
236    Uint2                  alnstyle; /* presumably one of the "amaip->alnstyle" defines (AM_CONTIG_LINEAR .. AM2_FULLINDEX) -- confirm */
237    Int4                   anchor;
238    Int4                   numrows;
239    SeqIdPtr               PNTR ids;  /* one SeqId per row */
240    Int4                   numsaps; /* presumably the element count of saps and aligned -- confirm */
241    SeqAlignPtr            PNTR saps;
242    Boolean                PNTR aligned; /* for each sap -- is it used in the overall alignment? */
243    SeqAlignPtr            sharedaln;
244 } AMAlignIndex2, PNTR AMAlignIndex2Ptr;
245 
246 NLM_EXTERN Boolean LIBCALLBACK AMAlignIndex2Free2(VoidPtr index); /* takes the index as a VoidPtr; presumably the freefunc for AMAlignIndex2 -- confirm */
247 
248 typedef struct { /* interrupt descriptor: type of AlnMsg2.left_interrupt/right_interrupt and input to AlnMgr2GetInterruptInfo */
249    Int4  insertlen;
250    Int4  unalnlen;
251    Int4  segnum;
252    Int4  row;
253    Int4  which_side; /* presumably AM2_LEFT or AM2_RIGHT -- confirm */
254 } AMInterrupt, PNTR AMInterruptPtr;
255 
256 typedef struct am_msg2 { /* request/result message passed to AlnMgr2GetNextAlnBit */
257 /* fields filled in by calling function */
258    Int4            from_aln; /* from is in alignment coordinates */
259    Int4            to_aln;   /* to is in alignment coordinates */
260    Int4            row_num; /* which row the function wants to retrieve */
261 
262 /* fields filled in by AlnMgr2GetNextAlnBit */
263    Int4            from_row; /* sequence coordinate if type is AM_SEQ, alignment coordinate if AM_GAP */
264    Int4            to_row; /* sequence coordinate if type is AM_SEQ, alignment coordinate if AM_GAP */
265    Uint1           strand;
266    Uint1           type; /* AM_SEQ or AM_GAP */
267    AMInterruptPtr  left_interrupt;
268    AMInterruptPtr  right_interrupt;
269 
270 /* fields used internally */
271    Int4            len;
272    Int4            real_from;
273 } AlnMsg2, PNTR AlnMsg2Ptr;
274 
275 NLM_EXTERN AlnMsg2Ptr AlnMsgNew2(void); /* returns an AlnMsg2Ptr; presumably newly allocated -- confirm */
276 NLM_EXTERN AlnMsg2Ptr AlnMsgFree2(AlnMsg2Ptr amp); /* presumably frees amp and returns NULL -- confirm */
277 NLM_EXTERN void AlnMsgReNew2(AlnMsg2Ptr amp); /* presumably resets amp so it can be reused -- confirm */
278 
279 typedef struct am_small { /* node with a next pointer carrying five Int4 values; their meaning is not documented here */
280    Int4  n1;
281    Int4  n2;
282    Int4  n3;
283    Int4  n4;
284    Int4  n5;
285    struct am_small PNTR next; /* pointer to another am_small node */
286 } AM_Small2, PNTR AM_Small2Ptr;
287 
288 typedef struct am_consistset { /* NOTE(review): purpose not documented here; name suggests consistency checking of alignment sets -- confirm */
289    Int4         numrows;
290    Int4Ptr      starts; /* presumably one entry per row (numrows) -- confirm */
291    Int4Ptr      stops; /* presumably one entry per row (numrows) -- confirm */
292    Uint1Ptr     strands; /* presumably one entry per row (numrows) -- confirm */
293    SeqAlignPtr  sap;
294    Int4         used;
295    Int4Ptr      which;
296    struct am_consistset PNTR next; /* pointer to another am_consistset node */
297 } AMConsSet, PNTR AMConsSetPtr;
298 
299 typedef struct am_coreinf { /* NOTE(review): purpose not documented in this header; confirm field meanings in alignmgr2.c */
300    Int4      start_core;
301    Int4      len;
302    Int4      sap_num;
303    Int4      row;
304    SeqIdPtr  sip;
305    Int4      start_aln; /* presumably an alignment coordinate -- confirm */
306    Int4      left;
307    Int4      right;
308 } AM_Core, PNTR AM_CorePtr;
309 
310 typedef struct am_rowinf { /* node holding a from/len pair plus a next pointer */
311    Int4  from;
312    Int4  len;
313    struct am_rowinf PNTR next; /* pointer to another am_rowinf node */
314 } AMRowInfo, PNTR AMRowInfoPtr; 
315 
316 typedef struct am_condenserow { /* groups a SeqId, a strand and a row number */
317    SeqIdPtr  sip;
318    Uint1     strand;
319    Int4      rownum;
320 } AMCdRow, PNTR AMCdRowPtr;
321 
322 typedef struct am_interval { /* node holding a from/to range and a strand; AMIntervalSet.int_head points to one */
323    Int4   from;
324    Int4   to;
325    Uint1  strand;
326    struct am_interval PNTR next; /* pointer to another am_interval node */
327 } AMInterval, PNTR AMIntervalPtr;
328 
329 typedef struct am_intervalset { /* node pairing a SeqId and strand with a pointer to AMInterval data */
330    SeqIdPtr       sip;
331    AMIntervalPtr  int_head; /* presumably the head of this SeqId's AMInterval list -- confirm */
332    Uint1          strand;
333    struct am_intervalset PNTR next; /* pointer to another am_intervalset node */
334 } AMIntervalSet, PNTR AMIntervalSetPtr;
335 
336 typedef struct am_edge { /* edge record with two vertex numbers, a weight and an associated seqalign */
337    Int4         vertex1; /* presumably identifies an AMVertex -- confirm */
338    Int4         vertex2; /* presumably identifies an AMVertex -- confirm */
339    Int4         weight;
340    Int4         used; /* AM_NOTUSED, AM_USED, AM_CONFLICT or AM_USED2 (the "AMEdge.used" defines) */
341    Boolean      aligned;
342    SeqAlignPtr  sap;
343    struct am_edge PNTR next; /* pointer to another am_edge node */
344 } AMEdge, PNTR AMEdgePtr;
345 
346 typedef struct am_vertex { /* vertex record: a SeqId with a from/to range and strand, plus bookkeeping flags */
347    SeqIdPtr  sip;
348    Int4      from;
349    Int4      to;
350    Uint1     strand;
351    Int4      numedges;
352    Boolean   used;
353    Boolean   visited; /* presumably a traversal marker -- confirm */
354    struct am_vertex PNTR next; /* pointer to another am_vertex node */
355 } AMVertex, PNTR AMVertexPtr;
356 
357 typedef struct am_queue { /* node holding one AMVertexPtr and a next pointer */
358    AMVertexPtr  vertex;
359    struct am_queue PNTR next; /* pointer to another am_queue node */
360 } AMQueue, PNTR AMQueuePtr;
361 
362 typedef struct am_segment { /* segment node; AMRowStart.segment points to one */
363    Int4         len;
364    Int4         which_row;
365    SeqAlignPtr  sap;
366    Int4         aligncoord; /* presumably an alignment coordinate -- confirm */
367    struct am_segment PNTR next; /* pointer to another am_segment node */
368 } AMSegment, PNTR AMSegmentPtr;
369 
370 typedef struct am_rowstart { /* node pairing a SeqId with a pointer to AMSegment data */
371    AMSegmentPtr  segment; /* presumably the first AMSegment for this row -- confirm */
372    SeqIdPtr      sip;
373    struct am_rowstart PNTR next; /* pointer to another am_rowstart node */
374 } AMRowStart, PNTR AMRowStartPtr;
375 
376 typedef struct am_frequency { /* frequency matrix; AlnMgr2ComputeFreqMatrix returns a pointer to one and AMFreqFree takes one */
377    Int4Ptr  PNTR freq; /* two-level array; dimensions are given by size and len below */
378    Int4     len;  /* second dimension */
379    Int4     size; /* first dimension */
380    Boolean  isna; /* presumably TRUE for a nucleic acid alignment -- confirm */
381 } AMFreq, PNTR AMFreqPtr;
382 
383 typedef struct am_bit { /* node with a next pointer carrying four Int4 values; their meaning is not documented here */
384    Int4  n;
385    Int4  num1;
386    Int4  num2;
387    Int4  num3;
388    struct am_bit PNTR next; /* pointer to another am_bit node */
389 } AMBitty2, PNTR AMBitty2Ptr;
390 
391 /***************************************************************************
392 *
393 *  SECTION 1: Functions for allocating and freeing data structures used
394 *  by the alignment manager; copying functions are also here.
395 *
396 ***************************************************************************/
397 NLM_EXTERN void AMFreqFree(AMFreqPtr afp); /* presumably frees an AMFreq such as one returned by AlnMgr2ComputeFreqMatrix -- confirm */
398 NLM_EXTERN void AMAlignIndexFreeEitherIndex(SeqAlignPtr sap); /* presumably frees whichever index (SAIndex2 or AMAlignIndex2) is attached to sap -- confirm */
399 NLM_EXTERN SeqAlignPtr AlnMgr2DupAlnAndIndexes(SeqAlignPtr sap); /* presumably returns a copy of sap together with its indexes -- confirm */
400 
401 /***************************************************************************
402 *
403 *  SECTION 2: Functions used to create the indexes for parent and child
404 *  seqaligns.
405 *    SECTION 2a: Functions to create indexes for child seqaligns, and
406 *                to convert seqaligns to dense-seg type
407 *    SECTION 2b: Functions to unpack and rearrange complicated seqaligns
408 *                into simple chains of dense-seg and dense-diag types
409 *    SECTION 2c: Functions to create indexes for parent seqaligns
410 *    SECTION 2d: Accessory functions for parent indexing
411 *
412 ***************************************************************************/
413 /* SECTION 2a */
414 /***************************************************************************
415 *
416 *  AlnMgr2IndexSingleChildSeqAlign takes a simple dense-seg or dense-diag
417 *  seqalign, converts it to dense-seg, and then calls
418 *  AlnMgr2IndexSingleDenseSegSA to create the indexes. If the alignment has
419 *  already been indexed, this erases that index and reindexes the alignment.
420 *
421 ***************************************************************************/
422 NLM_EXTERN Boolean AlnMgr2IndexSingleChildSeqAlign(SeqAlignPtr sap); /* Boolean result is not described above; presumably TRUE on success -- confirm */
423 
424 /* SECTION 2c */
425 /***************************************************************************
426 *
427 *  AlnMgr2IndexLite takes a seqalign or a list of seqaligns, converts
428 *  each alignment to a dense-seg structure and indexes it, and then
429 *  allocates an AMAlignIndex2 structure and fills in the saps array.
430 *
431 ***************************************************************************/
432 NLM_EXTERN Boolean AlnMgr2IndexLite(SeqAlignPtr sap); /* Boolean result is not described above; presumably TRUE on success -- confirm */
433 
434 /***************************************************************************
435 *
436 *  AlnMgr2IndexSeqAlign takes a seqalign of any type except std-seg and
437 *  creates indexes on it for easy retrieval of useful information by other
438 *  AlnMgr2 functions. If the seqalign is a single alignment, that alignment
439 *  gets a simple index and is left alone otherwise. If the seqalign is
440 *  a set of alignments or a dense-diag set, the subalignments get
441 *  individually indexed and then are combined into a (fake) multiple
442 *  alignment which also gets indexed. The subalignments can now be accessed
443 *  as a multiple alignment by AlnMgr2 functions.
444 *
445 ***************************************************************************/
446 NLM_EXTERN void AlnMgr2IndexSeqAlign(SeqAlignPtr sap);
447 NLM_EXTERN void AlnMgr2IndexSeqAlignEx(SeqAlignPtr sap, Boolean replace_gi); /* added in rev 6.19; NOTE(review): the effect of replace_gi is not documented in this header -- confirm */
448 
449 /***************************************************************************
450 *
451 *  AlnMgr2ReIndexSeqAlign takes an indexed alignment (that has, presumably,
452 *  been changed), makes sure all child seqaligns are indexed (if they are
453 *  already indexed they are not reindexed), and reindexes all the child
454 *  seqaligns as a set.
455 *
456 ***************************************************************************/
457 NLM_EXTERN void AlnMgr2ReIndexSeqAlign(SeqAlignPtr sap);
458 
459 NLM_EXTERN Boolean AlnMgr2IndexAsRows(SeqAlignPtr sap, Uint1 strand, Boolean truncate); /* NOTE(review): undocumented here (params changed in rev 6.14); confirm strand/truncate semantics and the Boolean result */
460 
461 /***************************************************************************
462 *
463 *  AlnMgr2IndexIndexedChain takes a linked list of indexed seqaligns
464 *  and does an in-place transformation to an indexed parent-child
465 *  seqalign set.
466 *
467 ***************************************************************************/
468 NLM_EXTERN void AlnMgr2IndexIndexedChain(SeqAlignPtr sap);
469 
470 /***************************************************************************
471 *
472 *  SECTION 3: Functions for debugging
473 *
474 ***************************************************************************/
475 NLM_EXTERN void am_print_sa_index(SeqAlignPtr sap, FILE *ofp); /* debugging aid; presumably writes the index of sap to ofp -- confirm */
476 NLM_EXTERN void AlnMgr2PrintSeqAlign(SeqAlignPtr sap, Int4 linesize, Boolean isnuc, FILE * ofp); /* debugging aid; presumably prints sap to ofp, linesize positions per line -- confirm */
477 
478 /***************************************************************************
479 *
480 *  SECTION 4: API-level functions (and their helper functions) used to
481 *  access an indexed alignment.
482 *    SECTION 4a: AlnMgr2GetNextAlnBit and associated functions
483 *    SECTION 4b: "GetNth" functions
484 *    SECTION 4c: other functions for accessing the alignment
485 *
486 ***************************************************************************/
487 /* SECTION 4a */
488 /***************************************************************************
489 *
490 *  AlnMgr2GetNextAlnBit takes an indexed seqalign and returns it, piece
491 *  by piece, in the row and across the range specified in the AlnMsg
492 *  structure. amp->from_aln and amp->to_aln must be filled in; these are
493 *  in alignment coordinates. AlnMgr2GetNextAlnBit will return the AlnMsg
494 *  structure with amp->from_row and amp->to_row filled in. If amp->type is
495 *  AM_SEQ, these numbers are sequence coordinates; if amp->type is AM_GAP
496 *  the numbers are alignment coordinates and there is a gap in that row.
497 *  AlnMgr2GetNextAlnBit returns one continuous piece of sequence or gap
498 *  at each call, and keeps returning TRUE until it has returned all the
499 *  information for the piece of the alignment requested.
500 *
501 ***************************************************************************/
502 NLM_EXTERN Boolean AlnMgr2GetNextAlnBit(SeqAlignPtr sap, AlnMsg2Ptr amp); /* note: per the AlnMsg2 field comments, amp->row_num is also filled in by the caller */
503 
504 /* SECTION 4a */
505 /***************************************************************************
506 *
507 *  AlnMgr2GetInterruptInfo returns a structure describing the inserts and
508 *  unaligned regions in an interrupt. The structure is allocated by this
509 *  function and must be freed with AlnMgr2FreeInterruptInfo.
510 *
511 ***************************************************************************/
512 NLM_EXTERN AMInterrInfoPtr AlnMgr2GetInterruptInfo(SeqAlignPtr sap, AMInterruptPtr interrupt);
513 
514 /* SECTION 4b */
515 /***************************************************************************
516 *
517 *  AlnMgr2GetNthStrand takes an indexed seqalign and a row number and 
518 *  returns the strand of the row indicated. A return of 0 indicates
519 *  an error.
520 *
521 ***************************************************************************/
522 NLM_EXTERN Uint1 AlnMgr2GetNthStrand(SeqAlignPtr sap, Int4 n); /* NOTE(review): n is presumably 1-based, as stated for AlnMgr2GetNthSeqIdPtr -- confirm */
523 
524 /***************************************************************************
525 *
526 *  AlnMgr2GetNthSeqIdPtr returns the seqid (this is a duplicated,
527 *  allocated seqid that must be freed) of the nth row (1-based) of an
528 *  indexed parent or child seqalign.
529 *
530 ***************************************************************************/
531 NLM_EXTERN SeqIdPtr AlnMgr2GetNthSeqIdPtr(SeqAlignPtr sap, Int4 n);
532 
533 /***************************************************************************
534 *
535 *  AlnMgr2GetNthSeqRangeInSA returns the smallest and largest sequence
536 *  coordinates contained in the nth row of an indexed seqalign. Either
537 *  start or stop can be NULL to only retrieve one of the coordinates.
538 *  If start and stop are -1, there is an error; if they are both -2, the
539 *  row is just one big insert. RANGE
540 *
541 ***************************************************************************/
542 NLM_EXTERN void AlnMgr2GetNthSeqRangeInSA(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop);
543 
544 /***************************************************************************
545 *
546 *  AlnMgr2GetNthRowSpanInSA returns the least and greatest alignment
547 *  coordinates (inclusive) spanned by the indicated row. Either stop or
548 *  start can be NULL to retrieve just one of the coordinates.
549 *
550 ***************************************************************************/
551 NLM_EXTERN void AlnMgr2GetNthRowSpanInSA(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop);
552 
553 NLM_EXTERN Int4 AlnMgr2GetMaxTailLength(SeqAlignPtr sap, Uint1 which_tail); /* undocumented here; which_tail is presumably AM2_LEFT_TAIL or AM2_RIGHT_TAIL as for AlnMgr2GetNthRowTail -- confirm */
554 
555 /***************************************************************************
556 *
557 *  AlnMgr2GetNthRowTail returns the sequence extremities that are not
558 *  contained in the alignment (if the alignment starts at 10 in row 2, the
559 *  tail in that row is 0-9). It takes an indexed seqalign, a 1-based row
560 *  number, and AM2_LEFT_TAIL or AM2_RIGHT_TAIL, and returns the start, stop,
561 *  and strand of the tail indicated in the row desired. AlnMgr2GetNthRowTail
562 *  returns TRUE if the calculations were successfully completed.
563 *
564 ***************************************************************************/
565 NLM_EXTERN Boolean AlnMgr2GetNthRowTail(SeqAlignPtr sap, Int4 n, Uint1 which_tail, Int4Ptr start, Int4Ptr stop, Uint1Ptr strand);
566 
567 /* SECTION 4c */
568 /***************************************************************************
569 *
570 *  AlnMgr2GetAlnLength returns the total alignment length of an indexed
571 *  alignment. If fill_in is TRUE, the function computes the total length
572 *  of all the internal unaligned regions and adds that to the alignment
573 *  length; otherwise only the aligned portions are considered.
574 *
575 ***************************************************************************/
576 NLM_EXTERN Int4 AlnMgr2GetAlnLength(SeqAlignPtr sap, Boolean fill_in);
577 
578 /* SECTION 4c functions for DDV */
579 NLM_EXTERN Boolean AlnMgr2IsSAPDiscAli(SeqAlignPtr sap); /* presumably TRUE when sap is a discontinuous alignment -- confirm */
580 NLM_EXTERN Int4 AlnMgr2GetNumAlnBlocks(SeqAlignPtr sap); /* presumably the number of aligned blocks in sap -- confirm */
581 NLM_EXTERN Boolean AlnMgr2GetNthBlockRange(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* presumably stores the range of block n in start/stop -- confirm */
582 
583 /***************************************************************************
584 *
585 *  AlnMgr2GetNthUnalignedForNthRow returns the bioseq coordinates for the
586 *  requested row, in the requested unaligned region. Any error will result
587 *  in -1 returns for both start and stop.
588 *
589 ***************************************************************************/
590 NLM_EXTERN Boolean AlnMgr2GetNthUnalignedForNthRow(SeqAlignPtr sap, Int4 unaligned, Int4 row, Int4Ptr start, Int4Ptr stop); /* Boolean result is not described above; presumably FALSE on error -- confirm */
591 
592 /***************************************************************************
593 *
594 *  AlnMgr2GetNextLengthBit is called in a loop on an indexed alignment, with
595 *  seg starting at 0, to return the lengths of the aligned and unaligned
596 *  regions. If the length returned is negative, it's an unaligned region;
597 *  otherwise it's aligned.
598 *
599 ***************************************************************************/
600 NLM_EXTERN Boolean AlnMgr2GetNextLengthBit(SeqAlignPtr sap, Int4Ptr len, Int4Ptr seg); /* Boolean result is not described above; presumably FALSE once no regions remain -- confirm */
601 
602 /***************************************************************************
603 *
604 *  AlnMgr2GetNumRows returns the number of rows in an indexed seqalign.
605 *
606 ***************************************************************************/
607 NLM_EXTERN Int4 AlnMgr2GetNumRows(SeqAlignPtr sap);
608 
609 /***************************************************************************
610 *
611 *  AlnMgr2GetNumSegs returns the number of gap- or aligned- contiguous
612 *  segments in the alignment (continuous or not).
613 *
614 ***************************************************************************/
615 NLM_EXTERN Int4 AlnMgr2GetNumSegs(SeqAlignPtr sap);
616 
617 /***************************************************************************
618 *
619 *  AlnMgr2GetNumSegsInRange returns the number of alignment segments
620 *  spanned by the given range (partially or fully). The range is
621 *  given in alignment coordinates.
622 *
623 ***************************************************************************/
624 NLM_EXTERN Int4 AlnMgr2GetNumSegsInRange(SeqAlignPtr sap, Int4 start, Int4 stop, Int4Ptr start_seg); /* start_seg is not described above; presumably receives the first segment in the range -- confirm */
625 
626 /***************************************************************************
627 *
628 *  AlnMgr2GetNthSegmentRange returns the alignment coordinate range of the
629 *  Nth segment (count starts at 1) of the seqalign. start and stop are
630 *  optional arguments (in case only one end is desired).
631 *
632 ***************************************************************************/
633 NLM_EXTERN void AlnMgr2GetNthSegmentRange(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop);
634 
635 /***************************************************************************
636 *
637 *  AlnMgr2GetFirstNForSip returns the first row that a seqid occurs on,
638 *  or -1 if the seqid is not in the alignment or if there is another
639 *  error.
640 *
641 ***************************************************************************/
642 NLM_EXTERN Int4 AlnMgr2GetFirstNForSip(SeqAlignPtr sap, SeqIdPtr sip);
643 
644 /***************************************************************************
645 *
646 *  AlnMgr2GetFirstNForSipList returns the first row that one of a list of seqids occurs on,
647 *  or -1 if none of the seqids are in the alignment or if there is another
648 *  error. 
649 *  Handy if sip comes from a BioSeq, where it can point to a linked list
650 *  of SeqIds.
651 *
652 ***************************************************************************/
653 NLM_EXTERN Int4 AlnMgr2GetFirstNForSipList(SeqAlignPtr sap, SeqIdPtr sip);
654 
655 /***************************************************************************
656 *
657 *  AlnMgr2GetParent returns the top-level seqalign associated with a given
658 *  indexed alignment. It returns the actual pointer, not a copy.
659 *
660 ***************************************************************************/
661 NLM_EXTERN SeqAlignPtr AlnMgr2GetParent(SeqAlignPtr sap);
662 
663 /***************************************************************************
664 *
665 *  SECTION 5: Functions to change, assign or retrieve an anchor row.
666 *    SECTION 5a: functions for child seqaligns
667 *    SECTION 5b: functions for parent seqaligns
668 *    SECTION 5c: functions to retrieve anchor row information
669 *
670 ***************************************************************************/
671 /* SECTION 5b */
672 
673 /***************************************************************************
674 *
675 *  AlnMgr2AnchorSeqAlign takes an indexed seqalign and a row (1-based) and
676 *  reindexes the alignment so that there are no gaps in the row indicated.
677 *  Other rows may contain inserts after this operation. After an alignment
678 *  is anchored, its length often shrinks.
679 *
680 ***************************************************************************/
681 NLM_EXTERN void AlnMgr2AnchorSeqAlign(SeqAlignPtr sap, Int4 which_row);
682 
683 /* SECTION 5c */
684 /***************************************************************************
685 *
686 *  AlnMgr2FindAnchor returns the row number (1-based) of the anchor row
687 *  for an indexed seqalign, or -1 if the alignment is unanchored or if
688 *  there is another type of error.
689 *
690 ***************************************************************************/
691 NLM_EXTERN Int4 AlnMgr2FindAnchor(SeqAlignPtr sap);
692 
693 /***************************************************************************
694 *
695 *  SECTION 6: Functions for coordinate conversion (bioseq to seqalign
696 *  coordinates and vice versa)
697 *
698 ***************************************************************************/
699 
700 /***************************************************************************
701 *
702 *  AlnMgr2MapBioseqToSeqAlign takes an indexed seqalign, a position in a
703 *  row of the alignment, and a 1-based row number, and maps the row position
704 *  to alignment coordinates.
705 *
706 ***************************************************************************/
707 NLM_EXTERN Int4 AlnMgr2MapBioseqToSeqAlign(SeqAlignPtr sap, Int4 pos, Int4 row); /* error return is not described above; presumably -1 as for AlnMgr2MapSeqAlignToBioseq -- confirm */
708 
709 /***************************************************************************
710 *
711 *  AlnMgr2MapSeqAlignToBioseq takes an indexed seqalign, an alignment
712 *  coordinate (pos), and the 1-based number of a row, and maps the alignment
713 *  coordinate to the corresponding bioseq coordinate of the row desired.
714 *  A return of -1 indicates an error; a return of -2 means that the bioseq
715 *  is gapped at this alignment position.
716 *
717 ***************************************************************************/
718 NLM_EXTERN Int4 AlnMgr2MapSeqAlignToBioseq(SeqAlignPtr sap, Int4 pos, Int4 row);
719 
720 /***************************************************************************
721 *
722 *  AlnMgr2MapRowToRow takes an indexed seqalign, a position in row1, the
723 *  1-based number of row1, and a target row (row2), and maps the bioseq
724 *  coordinate in row 1 to the corresponding (aligned) bioseq coordinate in
725 *  row2. A return of -1 indicates an error while a return of -2 means that
726 *  the bioseq in row2 is gapped at the desired position.
727 *
728 ***************************************************************************/
729 NLM_EXTERN Int4 AlnMgr2MapRowToRow(SeqAlignPtr sap, Int4 pos, Int4 row1, Int4 row2);
730 
731 /***************************************************************************
732 *
733 *  SECTION 7: Functions to change an alignment and retrieve parts of an
734 *    alignment
735 *
736 ***************************************************************************/
737 /***************************************************************************
738 *
739 *   AlnMgr2TruncateSeqAlign truncates a given seqalign to contain only the
740 *   bioseq coordinates from start to stop on the indicated row.  Anything
741 *   before those coordinates is discarded; anything remaining afterwards
742 *   is made into another seqalign and put in sap->next (the original next,
743 *   if any, is now at sap->next->next).  Doesn't work on parent seqaligns.
744 *   The function returns TRUE if the original alignment extended past stop.
745 *
746 ***************************************************************************/
747 NLM_EXTERN Boolean AlnMgr2TruncateSeqAlign(SeqAlignPtr sap, Int4 start, Int4 stop, Int4 row);
748 
749 /***************************************************************************
750 *
751 *  AlnMgr2GetSubAlign retrieves a portion of an indexed alignment, from
752 *  'from' to 'to' in the row coordinates specified, or if which_row is 0,
753 *  'from' and 'to' are assumed to be alignment coordinates. If 'to' is -1,
754 *  the subalignment will go to the end of the specified row (or to the end
755 *  of the whole alignment). If the alignment is discontinuous and fill_in
756 *  is FALSE, the alignment will be returned as an SAS_DISC set, each piece
757 *  represented by a single alignment. If the alignment is discontinuous and
758 *  fill_in is TRUE, the unaligned regions will be added in to the alignment,
759 *  with all gaps in all other rows. If the alignment is continuous, it
760 *  doesn't matter whether fill_in is TRUE or FALSE. (SUBALIGN)
761 *
762 ***************************************************************************/
763 NLM_EXTERN SeqAlignPtr AlnMgr2GetSubAlign(SeqAlignPtr sap, Int4 from, Int4 to, Int4 which_row, Boolean fill_in); /* NOTE(review): ownership of the returned seqalign is not stated; presumably the caller frees it -- confirm */
764 
765 /***************************************************************************
766 *
767 *  SECTION 8: Miscellaneous functions to compute useful information
768 *    about an alignment
769 *
770 ***************************************************************************/
771 /***************************************************************************
772 *
773 *  AlnMgr2ComputeScoreForSeqAlign computes an ad hoc numerical score for
774 *  an indexed alignment by computing a similarity score for the whole
775 *  alignment (residue pair by residue pair score, from a matrix for proteins
776 *  and identity for nucleotides) and then subtracting gap open and gap
777 *  extension penalties.
778 *
779 ***************************************************************************/
780 NLM_EXTERN Int4 AlnMgr2ComputeScoreForSeqAlign(SeqAlignPtr sap);
781 
782 /***************************************************************************
783 *
784 *  AlnMgr2ComputeFreqMatrix takes an indexed seqalign and returns a matrix
785 *  indicating nucleotide or amino acid frequency at each position of the
786 *  alignment. The matrix can be made over only a part of the alignment, if
787 *  from and to are nonzero, and if row is nonzero, from and to are taken
788 *  to be bioseq coordinates from that row (if row == 0 from and to are
789 *  assumed to be alignment coordinates).
790 *
791 ***************************************************************************/
792 NLM_EXTERN AMFreqPtr AlnMgr2ComputeFreqMatrix(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row);
793 
794 /***************************************************************************
795 *
796 *  AlnMgr2IsItProtein takes an indexed alignment and quickly decides if
797 *  it's a protein or nucleotide alignment, returning TRUE for protein.
798 *
799 ***************************************************************************/
800 NLM_EXTERN Boolean AlnMgr2IsItProtein(SeqAlignPtr sap);
801 
802 /***************************************************************************
803 *
804 *  SECTION 9: Sorting functions and other algorithms to help order
805 *  alignments for various purposes
806 *
807 ***************************************************************************/
808 /***************************************************************************
809 *
810 *  AlnMgr2SortAlnSetByNthRowPos takes an indexed parent alignment and sorts
811 *  all the child alignments along the row indicated. If the indicated row
812 *  is aligned on the plus strand, the alignments are sorted from smaller
813 *  to larger coordinates along that row; otherwise they are sorted in
814 *  reverse order.
815 *
816 ***************************************************************************/
817 NLM_EXTERN void AlnMgr2SortAlnSetByNthRowPos(SeqAlignPtr sap, Int4 row);
818 
819 /***************************************************************************
820 *
821 *  SECTION 10: Basic alignment operations
822 *
823 ***************************************************************************/
824 
825 /***************************************************************************
826 *
827 *  AlnMgr2MergeTwoAlignments takes two alignments, with identical rows in
828 *  the same order (otherwise it rejects the alignments), and merges them
829 *  into a single alignment. If there is unaligned space between the two
830 *  alignments and this space is the same length for every row, the function
831 *  aligns those sequences; it rejects alignments when the unaligned spaces
832 *  are different sizes. The function returns a newly allocated alignment.
833 *
834 ***************************************************************************/
835 NLM_EXTERN SeqAlignPtr AlnMgr2MergeTwoAlignments(SeqAlignPtr sap1_orig, SeqAlignPtr sap2_orig);
836 
837 /***************************************************************************
838 *
839 *  AlnMgr2ExtendToCoords takes an indexed child seqalign and blindly extends
840 *  it to the coordinates specified on the given row. If other rows are too
841 *  short to allow this extension, the alignment is extended as far as
842 *  possible. If to == -1 the extension goes to the end of the sequence
843 *  specified.
844 *
845 ***************************************************************************/
846 NLM_EXTERN void AlnMgr2ExtendToCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row);
847 
848 /***************************************************************************
849 *
850 *  AlnMgr2PadConservatively extends an alignment so that the whole of
851 *  all sequences is included. If two sequences have tails on the same
852 *  side, they are each aligned with columns of all gaps:
853 *
854 *   <-new aln region->
855 *   xxxxxxxx----------xxxxxxxxxxxxxxxxxxxx
856 *   --------xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
857 *
858 *  This function returns a newly allocated alignment and doesn't change
859 *  the original (except for indexing). If the extension was not done for
860 *  some reason, the function returns NULL.
861 *
862 ***************************************************************************/
863 NLM_EXTERN SeqAlignPtr AlnMgr2PadConservatively(SeqAlignPtr sap);
864 
865 /***************************************************************************
866 *
867 *  AlnMgr2ExtractPairwiseSeqAlign takes an indexed alignment (parent or
868 *  child, but must be fully indexed, not lite) and extracts a pairwise
869 *  subalignment containing the two requested rows. The subalignment is
870 *  unindexed and may have internal unaligned regions.
871 *
872 ***************************************************************************/
873 NLM_EXTERN SeqAlignPtr AlnMgr2ExtractPairwiseSeqAlign(SeqAlignPtr sap, Int4 n1, Int4 n2);
874 
875 /***************************************************************************
876 *
877 *  AlnMgr2RemoveInconsistentAlnsFromSet takes an alignment that is
878 *  indexed at least at the AM2_LITE level, and prunes the child
879 *  alignments so that the remaining alignments form a consistent, 
880 *  nonoverlapping set. All alignments must have the same number of rows,
881 *  and they must be the same rows (although not necessarily in the same
882 *  order). The function uses a simple greedy algorithm to construct the
883 *  nonoverlapping set, starting with the highest-scoring alignment.
884 *  If fuzz is negative, the function creates the best nonoverlapping set
885 *  by actually truncating alignments.
886 *
887 ***************************************************************************/
888 NLM_EXTERN void AlnMgr2RemoveInconsistentAlnsFromSet(SeqAlignPtr sap_head, Int4 fuzz);
889 
890 /***************************************************************************
891 *
892 *  AlnMgr2FuseSet takes a set of alignments sharing all their rows and orders
893 *  the alignments, then fuses together any adjacent alignments. If returnall
894 *  is TRUE, all pieces are returned; if not, then only the largest piece is
895 *  returned. This function will work best when called after
896 *  AlnMgr2RemoveInconsistentAlnsFromSet(sap_head, -1).
897 *
898 ***************************************************************************/
899 NLM_EXTERN SeqAlignPtr AlnMgr2FuseSet(SeqAlignPtr sap_head, Boolean returnall);
900 
901 /* SECTION 11 -- functions for std-segs */
902 NLM_EXTERN Int4 AlignMgr2GetFirstNForStdSeg(SeqAlignPtr sap, SeqIdPtr sip); /* NOTE(review): spelled "AlignMgr2", unlike the "AlnMgr2" prefix on the other declarations in this header */
903 NLM_EXTERN SeqIdPtr AlnMgr2GetNthSeqIdPtrStdSeg(SeqAlignPtr sap, Int4 n); /* presumably the SeqId of row n of a std-seg alignment -- confirm against the implementation */
904 NLM_EXTERN void AlnMgr2GetNthSeqRangeInSAStdSeg(SeqAlignPtr sap, Int4 n, Int4Ptr start, Int4Ptr stop); /* presumably fills *start and *stop with the range of row n -- confirm against the implementation */
905 
906 /***************************************************************************
907 *
908 *   AlnMgr2GetSeqRangeForSipInSAStdSeg returns the smallest and largest sequence
909 *  coordinates in a Std-Seg seqalign for a given Sequence Id.  It also returns the
910 *  strand type if it is the same on every segment; otherwise strand is set to Seq_strand_unknown.
911 *  Any of start, stop or strand can be NULL to only retrieve some of them.
912 *  If start and stop are -1, either there is an error (not a std-seg), the SeqID does not participate in this
913 *  alignment, or the alignment is one big insert on that id.  Returns true if the sip was found
914 *  in the alignment with real coordinates, i.e. *start would not be -1.  RANGE
915 *
916 ***************************************************************************/
917 NLM_EXTERN Boolean AlnMgr2GetSeqRangeForSipInSAStdSeg(SeqAlignPtr sap, SeqIdPtr sip, Int4Ptr start, Int4Ptr stop, Uint1Ptr strand);
918 
919 /***************************************************************************
920 *
921 *   AlnMgr2GetSeqRangeForSipInStdSeg returns the start and stop sequence
922 *  coordinates in a Std-Segment for a given Sequence Id.  It also returns the
923 *  strand type.  Any of start, stop, strand or segType can be NULL to only retrieve some of them.
924 *  Returns false if the SeqID was not found in this segment, in which case no meaningful
925 *    data is passed back in the other arguments.
926 *  Returns true if the sip was found, even if it is a gap (start, stop = -1).
927 *  segType is set to AM_SEQ if the SeqID Sequence is not empty and one of
928 *  the other sequences aligned with it is also not empty; to AM_GAP if
929 *  the other sequences are all empty; and to AM_INSERT if the main sequence
930 *  is empty.
931 *  RANGE
932 *
933 ***************************************************************************/
934 NLM_EXTERN Boolean AlnMgr2GetSeqRangeForSipInStdSeg(
935     StdSegPtr   ssp, 
936     SeqIdPtr    sip, 
937     Int4Ptr     start, 
938     Int4Ptr     stop, 
939     Uint1Ptr    strand,
940     Uint1Ptr    segType); /* AM_SEQ, AM_GAP, AM_INSERT */
941 
942 /***************************************************************************
943 *
944 *   AlnMgr2GetNthStdSeg returns a pointer to the Nth segment of
945 *   a standard segment alignment.
946 *   Returns NULL if there are not n segments or it is not a std-seg alignment.
947 *   Useful to pass its return value to AlnMgr2GetSeqRangeForSipInStdSeg()
948 *
949 ***************************************************************************/
950 NLM_EXTERN StdSegPtr AlnMgr2GetNthStdSeg(SeqAlignPtr sap, Int2 n); /* NOTE(review): n is Int2 here, while AlnMgr2GetNumStdSegs returns the segment count as Int4 */
951 
952 /***************************************************************************
953 *
954 *  AlnMgr2GetNumStdSegs returns the number of segments in a standard-seg alignment.
955 *   returns -1 if sap is null or not a standard-seg alignment.
956 *   the Std-Seg version of AlnMgr2GetNumSegs
957 *
958 ***************************************************************************/
959 NLM_EXTERN Int4 AlnMgr2GetNumStdSegs(SeqAlignPtr sap);
960 
961 /***************************************************************************
962 *
963 *  The two mapping functions act a little differently for std-segs. The
964 *  alignment coordinates are 1:1 linearly correlated with the longest
965 *  seqloc in the set; the others may be significantly shorter.
966 *  The mapping functions deal with % lengths, and map those instead of
967 *  coordinates (which may not be linear).
968 *
969 ***************************************************************************/
970 NLM_EXTERN Int4 AlnMgr2MapBioseqToSeqAlignStdSeg(SeqAlignPtr sap, Int4 n, Int4 pos);
971 NLM_EXTERN Int4 AlnMgr2MapSeqAlignToBioseqStdSeg(SeqAlignPtr sap, Int4 n, Int4 pos);
972 NLM_EXTERN Int4 AlnMgr2GetAlnLengthStdSeg(SeqAlignPtr sap); /* not one of the two mapping functions; presumably the std-seg alignment length -- confirm against the implementation */
973 
974 /***************************************************************************/
975 
976 
977 #ifdef __cplusplus
978 }
979 #endif
980 
981 #undef NLM_EXTERN
982 #ifdef NLM_EXPORT
983 #define NLM_EXTERN NLM_EXPORT
984 #else
985 #define NLM_EXTERN
986 #endif
987 
988 #endif
989 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.