|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/api/alignval.c |
source navigation diff markup identifier search freetext search file search |
1 /* alignval.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: alignval.c
27 *
28 * Author: Jian Ye, Colombe Chappey
29 *
30 * Version Creation Date: 6/3/99
31 *
32 * $Revision: 6.73 $
33 *
34 * File Description: To validate sequence alignment.
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45
46 #include <ncbi.h>
47 #include <seqmgr.h>
48 #include <objmgr.h>
49 #include <sequtil.h>
50 #include <sqnutils.h>
51 #include <satutil.h>
52 #include <salsap.h>
53 #include <txalign.h>
54 #include <salpacc.h>
55 #include <alignval.h>
56 #include <valid.h>
57 #include <alignmgr2.h>
58
59
60 Uint1 jybitnum[8]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
61
62 typedef struct saval {
63 Boolean message;
64 Boolean msg_success;
65 Boolean find_remote_bsp;
66 Boolean find_acc_bsp;
67 Boolean delete_salp;
68 Boolean delete_bsp;
69 Boolean retdel;
70 Boolean do_hist_assembly;
71 ValNodePtr ids;
72 Uint2 entityID;
73 Boolean dirty;
74 } SaVal, PNTR SaValPtr;
75
76 typedef struct JY_error_msg {
77 Uint1 level;/* corresponds to levels of ErrPostEx [none(0), info(1), war
78 n(2), error(3) and fatal(4)] */
79 CharPtr msg;
80 } JYErrorMsg, *JYErrorMsgPtr;
81
82 /******************************************************************
83 ***
84 *** Error Messaging
85 *** copies of the BLASt functions in blastpri.h
86 *** JYConstructErrorMessage = BlastConstructErrorMessage
87 *** JYErrorChainDestroy = BlastErrorChainDestroy
88 ***
89 ******************************************************************/
90
91 static ValNodePtr errorp = NULL;
92 #define BUFFER_LENGTH 512
93
94 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation);
95
96 static ValNodePtr JYConstructErrorMessage (CharPtr function, CharPtr message, Uint1 level, ValNodePtr PNTR vnpp)
97 {
98 Char buffer[BUFFER_LENGTH];
99 CharPtr ptr;
100 JYErrorMsgPtr error_msg;
101
102 if (vnpp == NULL)
103 return NULL;
104
105 buffer[0] = NULLB;
106 ptr = buffer;
107 if (function != NULL)
108 {
109 sprintf(buffer, "%s: ", function);
110 ptr = buffer + StringLen(buffer);
111 }
112
113 if (message != NULL)
114 {
115 sprintf(ptr, "%s", message);
116 }
117
118 error_msg = (JYErrorMsgPtr) MemNew(sizeof(JYErrorMsg));
119 error_msg->msg = StringSave(buffer);
120 error_msg->level = level;
121
122 ValNodeAddPointer(vnpp, 0, error_msg);
123
124 return *vnpp;
125 }
126
127 static ValNodePtr JYErrorChainDestroy (ValNodePtr vnp)
128
129 {
130 ValNodePtr start = vnp;
131 JYErrorMsgPtr error_msg;
132
133 while (vnp)
134 {
135 error_msg = (JYErrorMsgPtr) vnp->data.ptrvalue;
136 if (error_msg != NULL) {
137 MemFree(error_msg->msg);
138 }
139 vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
140 vnp = vnp->next;
141 }
142
143 ValNodeFree(start);
144
145 return NULL;
146 }
147 /******************************************************************
148 Output error message according to code defined in alignval.h.
149 id refers to seqid of the sequence that causes the error
150 and idcontext refers to other sequences in the same segment.
151 Intvalue is used to indicate 1) the segment where the sequence
152 with error is, or 2) the segtype in case of segtype error.
153 Please note that not all errors report all three
154 parameters(id, idcontext, Intvalue)
155 ******************************************************************/
156
157 static Boolean useValErr = FALSE;
158 static Boolean useLockByID = FALSE;
159 static ValidStructPtr useVsp = NULL;
160
161 static BioseqPtr AlignValBioseqLockById (SeqIdPtr sid)
162
163 {
164 Int4 old_sev;
165 BioseqPtr bsp = NULL;
166
167 if (useLockByID) {
168 old_sev = ErrSetMessageLevel (SEV_WARNING);
169 bsp = BioseqLockById (sid);
170 ErrSetMessageLevel ((ErrSev) old_sev);
171 } else {
172 bsp = BioseqFindCore (sid);
173 }
174 return bsp;
175 }
176
177 static Boolean AlignValBioseqUnlock (BioseqPtr bsp)
178
179 {
180 if (useLockByID) {
181 return BioseqUnlock (bsp);
182 } else {
183 return TRUE;
184 }
185 }
186
187 NLM_EXTERN void CDECL ValidErr VPROTO((ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...));
188
189 /*****************************************************************
190 * get the approximate sequence coordinate for an alignment segment
191 * sip == NULL -> get alignment coordinate
192 *****************************************************************/
193 static Int4 valmsggetseqpos(SeqAlignPtr sap, Int4 segment, SeqIdPtr sip)
194 {
195 Int4 c;
196 DenseDiagPtr ddp;
197 DenseSegPtr dsp;
198 Boolean found;
199 Int4 i;
200 Int4 j;
201 Int4 pos;
202 PackSegPtr psp;
203 Uint1Ptr seqpresence;
204 SeqIdPtr sip_tmp;
205 SeqLocPtr slp;
206 StdSegPtr ssp;
207
208 if (sap == NULL || sap->segs == NULL || segment == 0)
209 return -1;
210 if (sap->segtype == SAS_DENSEG)
211 {
212 dsp = (DenseSegPtr)sap->segs;
213 if (sip == NULL)
214 {
215 pos = 0;
216 for (c=0; c<segment; c++)
217 {
218 pos += dsp->lens[c];
219 }
220 return pos;
221 }
222 sip_tmp = dsp->ids;
223 i = 0;
224 found = FALSE;
225 while (!found && sip_tmp != NULL)
226 {
227 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
228 found = TRUE;
229 else
230 {
231 sip_tmp = sip_tmp->next;
232 i++;
233 }
234 }
235 if (!found || i>dsp->dim || segment > dsp->numseg)
236 return -1;
237 pos = 0;
238 for (c=0; c<segment; c++)
239 {
240 if ((j = dsp->starts[(dsp->dim*c)+i])>0)
241 pos=j;
242 }
243 return pos;
244 } else if (sap->segtype == SAS_DENDIAG)
245 {
246 ddp = (DenseDiagPtr)sap->segs;
247 pos = 0;
248 for (c=0; c<segment; c++)
249 {
250 pos += ddp->len;
251 ddp = ddp->next;
252 if (ddp == NULL)
253 return -1;
254 }
255 if (sip == NULL)
256 return pos;
257 sip_tmp = ddp->id;
258 i = 0;
259 found = FALSE;
260 while (!found && sip_tmp != NULL)
261 {
262 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
263 found = TRUE;
264 else
265 {
266 sip_tmp = sip_tmp->next;
267 i++;
268 }
269 }
270 if (!found || i>ddp->dim)
271 return -1;
272 return (ddp->starts[i]);
273 } else if (sap->segtype == SAS_STD)
274 {
275 ssp = (StdSegPtr)(sap->segs);
276 pos = 0;
277 for (c=0; c<segment-1; c++)
278 {
279 pos += SeqLocLen(ssp->loc);
280 ssp = ssp->next;
281 if (ssp == NULL)
282 return -1;
283 }
284 if (sip == NULL)
285 return pos;
286 slp = ssp->loc;
287 found = FALSE;
288 while (!found && slp!=NULL)
289 {
290 sip_tmp = SeqLocId(slp);
291 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
292 found = TRUE;
293 else
294 slp = slp->next;
295 }
296 if (!found)
297 return -1;
298 return (SeqLocStart(slp));
299 } else if (sap->segtype == SAS_PACKED)
300 {
301 psp = (PackSegPtr)(sap->segs);
302 if (segment > psp->numseg)
303 return -1;
304 if (sip == NULL)
305 {
306 pos = 0;
307 for (c=0; c<segment; c++)
308 {
309 pos += psp->lens[c];
310 }
311 return pos;
312 }
313 sip_tmp = psp->ids;
314 i = 0;
315 found = FALSE;
316 while (!found && sip_tmp != NULL)
317 {
318 if (SeqIdComp(sip, sip_tmp) == SIC_YES)
319 found = TRUE;
320 else
321 {
322 sip_tmp = sip_tmp->next;
323 i++;
324 }
325 }
326 if (!found || i>psp->dim)
327 return -1;
328 pos = 0;
329 seqpresence = NULL;
330 BSSeek(psp->present, 0, SEEK_SET);
331 seqpresence=MemNew(BSLen(psp->present));
332 if(!seqpresence)
333 return -1;
334 BSRead(psp->present, seqpresence, BSLen(psp->present));
335 for (c=0; c<segment; c++)
336 {
337 if (seqpresence[(c*psp->numseg+i)/8]&jybitnum[(c*psp->numseg+i)%8])
338 pos+=psp->lens[c];
339 }
340 return pos;
341 } else
342 return -1;
343 }
344
345
346 static BioseqPtr BioseqForAlignment (SeqAlignPtr salp)
347 {
348 Int4 row, num_rows;
349 BioseqPtr bsp = NULL;
350 SeqIdPtr sip;
351 SeqEntryPtr oldscope;
352 DenseDiagPtr ddp;
353
354 oldscope = SeqEntrySetScope (NULL);
355 /* NOTE - can't index DenseDiag chain during validation because we're examining the individual DenseDiags,
356 * and indexing converts it to DenseSegs.
357 */
358 if (salp->segtype == SAS_DENDIAG && salp->segs != NULL) {
359 ddp = (DenseDiagPtr) salp->segs;
360 while (bsp == NULL && ddp != NULL) {
361 for (sip = ddp->id; bsp == NULL && sip != NULL; sip = sip->next) {
362 bsp = BioseqFind (sip);
363 sip = sip->next;
364 }
365 ddp = ddp->next;
366 }
367 } else {
368 AlnMgr2IndexSingleChildSeqAlign(salp);
369 num_rows = AlnMgr2GetNumRows(salp);
370 for (row = 1; row <= num_rows && bsp == NULL; row++) {
371 sip = AlnMgr2GetNthSeqIdPtr(salp, row);
372 bsp = BioseqFind(sip);
373 }
374 }
375 SeqEntrySetScope (oldscope);
376 return bsp;
377 }
378
379
380 static void ValMessage (SeqAlignPtr salp, Int1 MessageCode, ErrSev errlevel, SeqIdPtr id, SeqIdPtr idcontext , Int4 Intvalue)
381 {
382
383 Char buf[256],
384 buf3[64],
385 string1[64],
386 string2[552];
387 GatherContextPtr gcp;
388 Int4 pos;
389
390 string1[0] = '\0';
391 string2[0] = '\0';
392 SeqIdWrite(id, buf, PRINTID_FASTA_LONG, sizeof(buf)-1);
393 switch(MessageCode)
394 {
395 case Err_SeqId:
396 sprintf(string1, "SeqId");
397 sprintf(string2, "The sequence corresponding to SeqId %s could not be found", buf);
398 break;
399
400 case Err_Strand_Rev:
401 pos = valmsggetseqpos(salp, Intvalue, id);
402 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
403 sprintf(string1, "Strand");
404 sprintf(string2, "The strand labels for SeqId %s are inconsistent across the alignment; the first inconsistent region is the %ld(th) region, near sequence position %ld, context %s", buf, (long) Intvalue, (long) pos, buf3);
405 break;
406
407 case Err_Denseg_Len_Start:
408 pos = valmsggetseqpos(salp, Intvalue, id);
409 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
410 sprintf(string1, "Start/Length");
411 sprintf(string2, "There is a problem with sequence %s, in segment %ld (near sequence position %ld), context %s: the segment is too long or short or the next segment has an incorrect start position", buf, (long) Intvalue, (long) pos, buf3);
412 break;
413
414 case Err_Start_Less_Than_Zero:
415 pos = valmsggetseqpos(salp, Intvalue, id);
416 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
417 sprintf(string1, "Start");
418 sprintf(string2, "Start point is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s", (long) Intvalue, (long) pos, buf, buf3);
419 break;
420
421 case Err_Start_More_Than_Biolen:
422 pos = valmsggetseqpos(salp, Intvalue, id);
423 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
424 sprintf(string1, "Start");
425 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
426 break;
427
428 case Err_End_Less_Than_Zero:
429 pos = valmsggetseqpos(salp, Intvalue, id);
430 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
431 sprintf(string1, "Length");
432 sprintf(string2, "End point is less than zero in segment %ld (near position %d) for sequence ID: %s in the context of %s. This could be a formatting error", (long) Intvalue, (int) pos,buf, buf3);
433 break;
434
435 case Err_End_More_Than_Biolen:
436 pos = valmsggetseqpos(salp, Intvalue, id);
437 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
438 sprintf(string1, "Length");
439 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
440 break;
441
442 case Err_Len_Less_Than_Zero:
443 pos = valmsggetseqpos(salp, Intvalue, id);
444 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
445 sprintf(string1, "Length");
446 sprintf(string2, "Segment length is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s. Look for extra characters in this segment or flanking segments", (long) Intvalue, (long) pos, buf, buf3);
447 break;
448
449 case Err_Len_More_Than_Biolen:
450 pos = valmsggetseqpos(salp, Intvalue, id);
451 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
452 sprintf(string1, "Length");
453 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
454 break;
455
456 case Err_Sum_Len_Start:
457 pos = valmsggetseqpos(salp, Intvalue, id);
458 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
459 sprintf(string1, "Start");
460 sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence. Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
461 break;
462
463 case Err_SeqAlign_DimSeqId_Not_Match:
464 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
465 sprintf(string1, "SeqId");
466 sprintf(string2, "The Seqalign has more or fewer ids than the number of rows in the alignment (context %s). Look for possible formatting errors in the ids.", buf3);
467 break;
468
469 case Err_Segs_DimSeqId_Not_Match:
470 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
471 sprintf(string1, "SeqId");
472 sprintf(string2, "In segment %ld, there are more or fewer rows than there are seqids (context %s). Look for possible formatting errors in the ids.", (long) Intvalue, buf3);
473 break;
474
475 case Err_Fastalike:
476 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
477 sprintf(string1, "Fasta");
478 sprintf(string2, "This may be a fasta-like alignment for SeqId: %s in the context of %s", buf, buf3);
479 break;
480
481 case Err_Null_Segs:
482 sprintf(string1, "Segs");
483 sprintf(string2, "This alignment is missing all segments. This is a non-correctable error -- look for serious formatting problems.");
484 break;
485
486 case Err_Segment_Gap:
487 pos = valmsggetseqpos(salp, Intvalue, id);
488 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
489 sprintf(string1, "Segs");
490 sprintf(string2, "Segment %ld (near alignment position %ld) in the context of %s contains only gaps. Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them.", (long) Intvalue + 1, (long) pos, buf3);
491 break;
492
493 case Err_Segs_Dim_One:
494 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
495 sprintf(string1, "Segs");
496 sprintf(string2, "Segment %ld apparently has only one sequence. Each portion of the alignment must have at least two sequences. context %s", (long) Intvalue, buf3);
497 break;
498
499 case Err_SeqAlign_Dim_One:
500 SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
501 sprintf(string1, "Dim");
502 sprintf(string2, "This seqalign apparently has only one sequence. Each alignment must have at least two sequences. context %s", buf3);
503 break;
504
505 case Err_Segtype :
506 sprintf(string1, "Segs");
507 sprintf(string2, "This alignment has a undefined or unsupported Seqalign segtype %ld", (long) Intvalue);
508 break;
509
510 case Err_Pcnt_ID :
511 sprintf(string1, "PercentIdentity");
512 sprintf(string2, "This alignment has a percent identity of %d%%", Intvalue);
513 break;
514
515 case Err_Short_Aln:
516 sprintf(string1, "ShortAln");
517 sprintf(string2, "This alignment is shorter than at least one non-farpointer sequence.");
518 break;
519
520 case Err_Unexpected_Alignment_Type:
521 sprintf(string1, "UnexpectedAlignmentType");
522 sprintf (string2, "This is not a DenseSeg alignment.");
523 break;
524
525 default:
526 break;
527 }
528 if (useValErr) {
529 if (salp != NULL && useVsp != NULL) {
530 gcp = useVsp->gcp;
531 if (gcp != NULL) {
532 gcp->entityID = salp->idx.entityID;
533 gcp->itemID = salp->idx.itemID;
534 gcp->thistype = salp->idx.itemtype;
535
536 useVsp->bsp = BioseqForAlignment(salp);
537 ValidErr (useVsp, errlevel, 6, MessageCode, "%s: %s", string1, string2);
538 }
539 }
540 return;
541 }
542 if (StringLen(string1) > 0)
543 errorp = JYConstructErrorMessage (string1, string2, errlevel, &errorp);
544 }
545
546
547 /******************************************************************
548 return the number of seqid
549 ******************************************************************/
550 static Int2 CountSeqIdInSip (SeqIdPtr sip)
551 {
552 Int2 numids=0;
553
554 while(sip)
555 {
556 numids++;
557 sip=sip->next;
558 }
559 return numids;
560 }
561
562 /*********************************************************/
563 static void delete_bioseqs (ValNodePtr ids, Uint2 entityID)
564 {
565 SeqEntryPtr sep_top;
566 SeqEntryPtr sep_del;
567 ValNodePtr vnp;
568 SeqIdPtr sip;
569 SeqLocPtr slp;
570 BioseqPtr bsp;
571 ObjMgrDataPtr omdptop;
572 ObjMgrData omdata;
573 Uint2 parenttype;
574 Pointer parentptr;
575
576 if (ids == NULL)
577 return;
578 sep_top = GetTopSeqEntryForEntityID (entityID);
579 SaveSeqEntryObjMgrData (sep_top, &omdptop, &omdata);
580 GetSeqEntryParent (sep_top, &parentptr, &parenttype);
581
582 vnp=ids;
583 while (vnp!=NULL)
584 {
585 sip = (SeqIdPtr) vnp->data.ptrvalue;
586 if (sip!=NULL) {
587 slp = (SeqLocPtr)ValNodeNew (NULL);
588 slp->choice = SEQLOC_WHOLE;
589 slp->data.ptrvalue = sip;
590 bsp = GetBioseqGivenSeqLoc (slp, entityID);
591 if (bsp!=NULL) {
592 sep_del=GetBestTopParentForData (entityID, bsp);
593 RemoveSeqEntryFromSeqEntry (sep_top, sep_del, FALSE);
594 }
595 slp->data.ptrvalue = NULL;
596 SeqLocFree (slp);
597 }
598 vnp=vnp->next;
599 }
600 SeqMgrLinkSeqEntry (sep_top, parenttype, parentptr);
601 RestoreSeqEntryObjMgrData (sep_top, omdptop, &omdata);
602 RenormalizeNucProtSets (sep_top, TRUE);
603
604 for (vnp=ids; vnp!=NULL; vnp=vnp->next) {
605 SeqIdFree ((SeqIdPtr) vnp->data.ptrvalue);
606 vnp->data.ptrvalue = NULL;
607 }
608 ValNodeFree (vnp);
609 return;
610 }
611
612
613 /******************************************************************
614 validate a SeqId
615 ******************************************************************/
616 static void ValidateSeqId (SeqIdPtr sip, SeqAlignPtr salp)
617 {
618 SeqIdPtr siptemp=NULL, sipnext;
619 BioseqPtr bsp=NULL;
620
621 for(siptemp=sip; siptemp!=NULL; siptemp=siptemp->next)
622 {
623 /*
624 bsp = AlignValBioseqLockById(siptemp);
625 if(!bsp)
626 ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
627 else
628 AlignValBioseqUnlockById(siptemp);
629 */
630 sipnext = siptemp->next;
631 siptemp->next = NULL;
632 bsp = BioseqFindCore (siptemp);
633 if (bsp == NULL && siptemp->choice == SEQID_LOCAL) {
634 ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
635 }
636 siptemp->next = sipnext;
637 }
638 return;
639 }
640
641 /******************************************************************
642 return seqid for each seg.
643 Note that a newly created seqid chain is returned for stdseg
644 and you need to free the memory after you use it in this case
645 ******************************************************************/
646 static SeqIdPtr SeqIdInAlignSegs(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
647 {
648
649 SeqIdPtr sip=NULL;
650 StdSegPtr ssp;
651 DenseDiagPtr ddp;
652 DenseSegPtr dsp;
653 PackSegPtr psp;
654 SeqLocPtr slp=NULL, slptemp;
655
656 if(!segs)
657 {
658 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
659 return NULL;
660 }
661 if(segtype==1)
662 { /* DenseDiag */
663
664 ddp=(DenseDiagPtr)segs;
665 sip=ddp->id;
666 }
667 else if (segtype==2)
668 { /* DenseSeg */
669
670 dsp = (DenseSegPtr) segs;
671 sip=dsp->ids;
672 }
673 else if (segtype==3)
674 { /* StdSeg */
675
676 ssp = (StdSegPtr)segs;
677 slp = ssp->loc;
678 /*make a new linked list of SeqId*/
679 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
680 AddSeqId(&sip, SeqLocId(slptemp));
681
682 }
683 else if(segtype==4)
684 { /* Packed Seg. Optimal for editing alignments */
685
686 psp = (PackSegPtr)segs;
687 if (psp!=NULL)
688 sip = psp->ids;
689 }
690 return sip;
691 }
692
693
694 /******************************************************************
695 validate SeqId in sequence alignment
696 ******************************************************************/
697 static void ValidateSeqIdInSeqAlign (SeqAlignPtr salp)
698 {
699 SeqIdPtr sip=NULL;
700 Pointer segptr=NULL;
701 DenseDiagPtr ddp=NULL, ddptemp;
702 StdSegPtr ssp=NULL, ssptemp;
703
704
705 if(salp)
706 {
707 segptr=salp->segs;
708 if(!segptr)
709 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
710 else
711 {
712
713 /*densediag */
714 if(salp->segtype==1)
715 {
716 /*cast to appropriate pointer*/
717 ddp=(DenseDiagPtr)segptr;
718 for(ddptemp=ddp; ddptemp!=NULL; ddptemp=ddptemp->next)
719 {
720
721 sip=SeqIdInAlignSegs((Pointer)ddptemp, salp->segtype, salp);
722 ValidateSeqId(sip, salp);
723 }
724 }
725
726 /*Stdseg*/
727 else if(salp->segtype==3)
728 {
729 /*cast to appropriate pointer*/
730 ssp=(StdSegPtr)segptr;
731 for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
732 {
733
734 sip=SeqIdInAlignSegs((Pointer)ssptemp, salp->segtype, salp);
735 ValidateSeqId(sip, salp);
736 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
737 SeqIdSetFree(sip);
738 }
739 }
740
741 /*Denseseg, Packseg*/
742 else if(salp->segtype==2||salp->segtype==4)
743 {
744
745 sip=SeqIdInAlignSegs(segptr, salp->segtype, salp);
746 ValidateSeqId(sip, salp);
747 }
748 }
749 }
750 }
751
752 /******************************************************************
753 return true if two sip are the same, false otherwise.
754 Also return false if there is error in sip
755 ******************************************************************/
756 static Boolean SeqIdCmp (SeqIdPtr sip1, SeqIdPtr sip2)
757 {
758 Char buf1[256], buf2[256];
759
760 if(!sip1||!sip2)
761 return FALSE;
762
763 SeqIdWrite(sip1, buf1, PRINTID_FASTA_LONG, 255);
764 SeqIdWrite(sip2, buf2, PRINTID_FASTA_LONG, 255);
765 return(!StringCmp(buf1, buf2));
766
767 }
768
769
770 /******************************************************************
771 return the strand for a seqloc with seqid=sip in a stdseg.
772 Note, it returns 255 if null sip or ssp
773 ******************************************************************/
774 static Uint1 SeqLocStrandForSipInStdSeg (SeqIdPtr sip, StdSegPtr ssp, SeqAlignPtr salp)
775 {
776 SeqLocPtr slp, slptemp;
777 Uint1 strand=0;
778
779 if(!sip||!ssp)
780 return (255);
781
782 slp=ssp->loc;
783 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
784 {
785 if(SeqIdCmp(sip, SeqLocId(slptemp)))
786 {
787 strand=SeqLocStrand(slptemp);
788 break;
789 }
790 }
791 return strand;
792 }
793
794
795 /******************************************************************
796 check if the strand is consistent in Stdseg
797 ******************************************************************/
798 static void ValidateStrandInStdSeg(StdSegPtr ssp, SeqAlignPtr salp)
799 {
800 SeqIdPtr sip=NULL, sip_inseg=NULL;
801 Uint1 strand1=0, strand2=0;
802 StdSegPtr ssptemp, ssptemp2, ssptemp3;
803 SeqLocPtr slp, slptemp;
804 ValNodePtr FinishedSip=NULL, temp;
805 Boolean CheckedStatus;
806 Int4 start_numseg=0, end_numseg=0;
807
808 if(!ssp)
809 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
810 else
811 for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
812 {
813 sip_inseg=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
814 start_numseg++;
815 slp=ssptemp->loc;
816 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
817 {
818
819 CheckedStatus=FALSE;
820 sip=SeqLocId(slptemp);
821 if(sip)
822 {
823 /*if a seqloc represented by a sip has been checked, set the checkedstatus flag to true so it will not be checked again*/
824 for(temp=FinishedSip; temp!=NULL; temp=temp->next)
825 {
826 if(SeqIdCmp(sip, temp->data.ptrvalue))
827 {
828 CheckedStatus=TRUE;
829 break;
830 }
831 }
832 /*seqloc not checked yet*/
833 if(!CheckedStatus)
834 {
835
836 /*keep a record of checked sip*/
837 ValNodeAddPointer(&FinishedSip, 0, sip);
838 end_numseg=start_numseg;
839 /*go through all segs to get at least two strand, if any, for this seqloc*/
840 for(ssptemp2=ssptemp; ssptemp2!=NULL; ssptemp2=ssptemp2->next, end_numseg++)
841 {
842 /*get the first defined strand */
843 strand1=SeqLocStrandForSipInStdSeg(sip, ssptemp2, salp);
844
845 if(strand1!=0&&strand1!=255)
846 {
847 ssptemp2=ssptemp2->next;
848 break;
849 }
850
851 }
852
853 if(strand1!=0&&strand1!=255)
854 /*continue to get next strand */
855 for(ssptemp3=ssptemp2; ssptemp3!=NULL; ssptemp3=ssptemp3->next, end_numseg++)
856 {
857 strand2=SeqLocStrandForSipInStdSeg(sip, ssptemp3, salp);
858 if(strand2==0||strand2==255)
859 continue;
860
861 if(strand2!=0&&strand2!=255)
862 /*strand should be same for a given seq*/
863 if(strand1!=strand2)
864
865 ValMessage (salp, Err_Strand_Rev, SEV_ERROR, sip, sip_inseg, end_numseg+1);
866 }
867 }
868 }
869 }
870 SeqIdSetFree(sip_inseg);
871
872 }
873
874 ValNodeFree(FinishedSip);
875 }
876
877
878 /******************************************************************
879 check if the strand is consistent in Denseseg
880 ******************************************************************/
881 static void ValidateStrandInPack_DenseSeg(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
882 {
883 DenseSegPtr dsp=NULL;
884 PackSegPtr psp=NULL;
885 Int4 numseg, aligndim, dimnumseg, i, j, m;
886 SeqIdPtr sip=NULL, siptemp;
887 Uint1 strand1=0, strand2=0;
888 Uint1Ptr strandptr=NULL;
889
890 if(!segs)
891 {
892 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
893 }
894 else if(segtype==2||segtype==4)
895 {
896 if(segtype==2)
897 {
898 dsp=(DenseSegPtr)segs;
899 strandptr=dsp->strands;
900 sip=dsp->ids;
901 numseg=dsp->numseg;
902 aligndim=dsp->dim;
903 }
904 else if(segtype==4)
905 {
906 psp=(PackSegPtr)segs;
907 strandptr=psp->strands;
908 sip=psp->ids;
909 numseg=psp->numseg;
910 aligndim=psp->dim;
911 }
912
913 dimnumseg=numseg*aligndim;
914 if(strandptr)
915 {
916 /*go through id for each alignment sequence*/
917 for(j=0; j<aligndim; j++)
918 {
919 /* first strand value for each sequence*/
920 strand1=strandptr[j];
921 /* go through all strand values for each sequence*/
922 for(i=j+aligndim; i<dimnumseg; i=i+aligndim)
923 {
924 strand2=strandptr[i];
925
926 if(strand1==0||strand1==255)
927 {
928 strand1=strand2;
929 continue;
930 }
931
932 /*skip undefined strand*/
933 if(strand2!=0&&strand2!=255)
934 {
935 /*strand should be same for a given seq*/
936 if(strand1!=strand2)
937 {
938 /*find current seqid*/
939
940 siptemp=sip;
941 for(m=0; m<j&&siptemp!=NULL; m++)
942 {
943 siptemp=siptemp->next;
944 }
945 ValMessage (salp, Err_Strand_Rev, SEV_ERROR, siptemp, sip, i/aligndim+1);
946 }
947 }
948 }
949 }
950 }
951 }
952 }
953
954
955
956
957 /******************************************************************
958 check if the strand is consistent in SeqAlignment of global
959 or partial type
960 ******************************************************************/
961 static void ValidateStrandinSeqAlign(SeqAlignPtr salp)
962 {
963 StdSegPtr ssp=NULL ;
964
965 if(salp)
966 {
967
968 /*Strands needs to be validated in case of global or partial alignment*/
969
970 /*denseseg or packseg*/
971 if(salp->segtype==2||salp->segtype==4)
972
973 ValidateStrandInPack_DenseSeg(salp->segs, salp->segtype, salp);
974
975 /*stdseg*/
976 else if(salp->segtype==3)
977 {
978 ssp=(StdSegPtr)salp->segs;
979 ValidateStrandInStdSeg(ssp, salp);
980 }
981 }
982 }
983
984
985
986 /******************************************************************
987 Make sure that, in Densediag alignment, segment length and
988 start point is not less than zero, and segment length is not greater
989 than Bioseq length
990 ******************************************************************/
991 static void ValidateSeqlengthInDenseDiag (DenseDiagPtr ddp, SeqAlignPtr salp)
992 {
993 Int4Ptr stptr=NULL;
994 DenseDiagPtr ddptemp;
995 Int2 numseg, i;
996 SeqIdPtr sip=NULL, siptemp;
997 Int4 bslen;
998 BioseqPtr bsp=NULL;
999
1000
1001 if(!ddp)
1002 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1003 else
1004 {
1005 for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1006 {
1007 sip=ddp->id;
1008 stptr=ddptemp->starts;
1009
1010 if(stptr)
1011 {
1012 for(i=0, siptemp=sip; i<ddptemp->dim; i++, siptemp=siptemp->next)
1013 {
1014 bsp=AlignValBioseqLockById(siptemp);
1015 if(bsp)
1016 {
1017 bslen=bsp->length;
1018 AlignValBioseqUnlock (bsp);
1019 /*verify start*/
1020 if(stptr[i]<0)
1021 ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg);
1022 if(stptr[i]>=bslen)
1023 ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg);
1024
1025 /*verify length*/
1026
1027 if(ddptemp->len<0)
1028 ValMessage (salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg);
1029
1030 if(ddptemp->len+stptr[i]>bslen)
1031 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg);
1032 }
1033 }
1034 }
1035 }
1036 }
1037 }
1038
1039
1040 /******************************************************************
1041 return a new copy of len array in reversed order
1042 ******************************************************************/
1043 static Int4Ptr GetReverseLength (Int2 numseg, Int4Ptr lenptr)
1044 {
1045 Int4Ptr lenptrtemp=NULL;
1046 Int2 p;
1047
1048 if(!lenptr)
1049 return NULL;
1050
1051 lenptrtemp=(Int4Ptr)MemNew(numseg*sizeof(Int4Ptr));
1052 if(!lenptrtemp)
1053 {
1054 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1055 return NULL;
1056 }
1057 for(p=0; p<numseg; p++)
1058 lenptrtemp[p]=lenptr[numseg-1-p];
1059 return lenptrtemp;
1060
1061 }
1062
1063 /******************************************************************
1064 return a new copy of start array in reversed "numseg" order .
1065 Note that the relative position of starts in each numseg has not changed.
1066 Example: original length={0, 0, 10, -1, 30, 10}, numseg=3,
1067 lens={10, 20, 40}, the reversed length={30, 10, 10, -1, 0, 0}
1068 ******************************************************************/
1069 static Int4Ptr GetReverseStart(Int2 numseg, Int2 dim, Int4Ptr stptr)
1070 {
1071 Int4Ptr stptrtemp=NULL;
1072 Int2 p, q;
1073
1074 if(!stptr)
1075 return NULL;
1076
1077 stptrtemp=(Int4Ptr)MemNew(numseg*dim*sizeof(Int4Ptr));
1078 if(!stptrtemp)
1079 {
1080 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1081 return NULL;
1082 }
1083 for(p=0; p<numseg; p++)
1084 for(q=0; q<dim; q++)
1085 stptrtemp[q+p*dim]=stptr[q+(numseg-1-p)*dim];
1086
1087 return stptrtemp;
1088 }
1089
1090
1091
1092 /******************************************************************
1093 Make sure that, in Denseseg alignment, segment length and
1094 start point agrees each other and the sum of segment length
1095 is not greater than Bioseq length
1096 ******************************************************************/
1097 static void ValidateSeqlengthInDenseSeg (DenseSegPtr dsp, SeqAlignPtr salp)
1098 {
1099
1100 Int4Ptr lenptr=NULL, stptr=NULL, lenptrtemp=NULL, stptrtemp=NULL, lenptrtemp2=NULL, stptrtemp2=NULL;
1101
1102 Int2 numseg, aligndim, i, j;
1103 SeqIdPtr sip=NULL, siptemp;
1104 Int4 bslen = 0;
1105 BioseqPtr bsp=NULL;
1106
1107 if(!dsp)
1108 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1109 else
1110 {
1111 numseg=dsp->numseg;
1112 aligndim=dsp->dim;
1113
1114 stptr=dsp->starts;
1115 lenptr=dsp->lens;
1116 sip=dsp->ids;
1117
1118 if(stptr==NULL||lenptr==NULL)
1119 return;
1120
1121
1122 /*go through each sequence*/
1123 for(j=0, siptemp=sip; j<aligndim&&siptemp; j++, siptemp=siptemp->next)
1124 {
1125
1126 lenptrtemp=lenptr;
1127 stptrtemp=stptr;
1128 /*if on minus strand, use reversed length and start array*/
1129 if(dsp->strands)
1130 {
1131 if(dsp->strands[j]==Seq_strand_minus)
1132 {
1133 if(!lenptrtemp2&&!stptrtemp2)
1134 {
1135 lenptrtemp2= GetReverseLength (numseg, lenptr);
1136 if (lenptrtemp2==NULL)
1137 return;
1138 stptrtemp2= GetReverseStart (numseg, aligndim, stptr);
1139 if (stptrtemp2==NULL)
1140 return;
1141 }
1142 lenptrtemp=lenptrtemp2;
1143 stptrtemp=stptrtemp2;
1144 }
1145 }
1146
1147 bsp=AlignValBioseqLockById(siptemp);
1148 if(bsp!=NULL)
1149 {
1150 bslen=bsp->length;
1151 AlignValBioseqUnlock (bsp);
1152 }
1153
1154 /*go through each segment for a given sequence*/
1155 for(i=0; i<numseg; i++)
1156 {
1157
1158 /*no need to verify if segment is not present*/
1159 if(stptrtemp[j+i*aligndim]!=-1)
1160 {
1161
1162 /*length plus start should be equal to next start*/
1163 /*check a start if it's not the last one and the next start is not -1*/
1164 if(i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]!=-1)
1165 {
1166
1167 if(stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+(i+1)*aligndim])
1168 {
1169 if (dsp->strands)
1170 {
1171 if(dsp->strands[j]==2)
1172 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i);
1173 else
1174 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1175 }
1176 else
1177 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1178 }
1179 }
1180 /*check a start if it's not the last one and the next start is -1*/
1181 else if (i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]==-1)
1182 {
1183 Int4 k=i+1;
1184 /*find the next start that is not last and not -1*/
1185 while(k<numseg&&stptrtemp[j+k*aligndim]==-1)
1186 k++;
1187
1188 /*length plus start should be equal to the closest next start that is not -1*/
1189
1190 if(k<numseg&&stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+k*aligndim])
1191 {
1192 if (dsp->strands)
1193 {
1194 if(dsp->strands[j]==2)
1195 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i);
1196 else
1197 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1198 }
1199 else
1200 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1201 }
1202 }
1203
1204
1205 /*make sure the start plus segment does not exceed total bioseq length*/
1206 if(bsp!=NULL)
1207 {
1208
1209 if(stptrtemp[j+i*aligndim]+lenptrtemp[i]>bslen)
1210 if (dsp->strands)
1211 {
1212 if(dsp->strands[j]==2)
1213 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg-1);
1214 else
1215 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1216 }
1217 else
1218 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1219 }
1220
1221 }
1222
1223 }
1224 }
1225 }
1226
1227
1228 MemFree(lenptrtemp2);
1229 MemFree(stptrtemp2);
1230
1231
1232 }
1233
1234 /******************************************************************
1235 Make sure that, in Seqloc of a Stdseg alignment,
1236 end point, start point and length are not less than zero,
1237 and are not greater than Bioseq length
1238 ******************************************************************/
1239 static void ValidateSeqlengthInStdSeg (StdSegPtr ssp, SeqAlignPtr salp)
1240 {
1241 StdSegPtr ssptemp;
1242 Int2 numseg;
1243 SeqIdPtr sip=NULL, siptemp;
1244 Int4 start, end, length, bslen;
1245 BioseqPtr bsp=NULL;
1246 SeqLocPtr slp=NULL, slptemp;
1247
1248 if(!ssp) {
1249 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1250 } else {
1251 for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++) {
1252 /*get all seqid in current segment*/
1253 sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1254 slp=ssptemp->loc;
1255 if(slp==NULL)
1256 return;
1257 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next) {
1258 siptemp=SeqLocId(slptemp);
1259 start=SeqLocStart(slptemp);
1260 end=SeqLocStop(slptemp);
1261 length=SeqLocLen(slptemp);
1262
1263 bsp=AlignValBioseqLockById(siptemp);
1264 if(bsp) {
1265 bslen=bsp->length;
1266 AlignValBioseqUnlock (bsp);
1267
1268 /*verify start*/
1269 if(start<0) {
1270 ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1271 }
1272
1273 if(start>bslen-1) {
1274 ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1275 }
1276
1277 /*verify end*/
1278 if(end<0) {
1279 ValMessage (salp, Err_End_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1280 }
1281 if(end>bslen-1) {
1282 ValMessage (salp, Err_End_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1283 }
1284
1285 /*verify length*/
1286 if(length<0) {
1287 ValMessage (salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);
1288 }
1289
1290 if(length>bslen) {
1291 ValMessage (salp, Err_Len_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);
1292 }
1293
1294 }
1295 }
1296 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1297 SeqIdSetFree(sip);
1298 }
1299 }
1300 }
1301
1302 /******************************************************************
1303 validate the start and segment length in packseg
1304 ******************************************************************/
1305 static void ValidateSeqlengthInPackSeg (PackSegPtr psp, SeqAlignPtr salp)
1306 {
1307 Uint1Ptr seqpresence=NULL;
1308 Int2 numseg, aligndim, i, j;
1309 SeqIdPtr sip=NULL, siptemp;
1310 Int4Ptr stptr=NULL, lenptr=NULL;
1311 BioseqPtr bsp=NULL;
1312 Int4 bslen, seg_start;
1313
1314 if(!psp)
1315 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1316 else
1317 {
1318 numseg=psp->numseg;
1319 aligndim=psp->dim;
1320 sip=psp->ids;
1321 stptr=psp->starts;
1322 lenptr=psp->lens;
1323
1324 if(stptr&&lenptr)
1325 {
1326 if(psp->present)
1327 {
1328 BSSeek(psp->present, 0, SEEK_SET);
1329 seqpresence=MemNew(BSLen(psp->present));
1330 if(!seqpresence)
1331 {
1332
1333 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1334 return;
1335
1336 }
1337 BSRead(psp->present, seqpresence, BSLen(psp->present));
1338 /*go through each sequence*/
1339 for(j=0, siptemp=sip; j<aligndim && siptemp != NULL; siptemp=siptemp->next, j++)
1340 {
1341 bsp=AlignValBioseqLockById(siptemp);
1342 if(bsp)
1343 {
1344 bslen=bsp->length;
1345 AlignValBioseqUnlock (bsp);
1346 seg_start=stptr[j];
1347 /*check start*/
1348 if(seg_start<0)
1349 ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , 0);
1350 if(seg_start>=bslen)
1351 ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , 0);
1352
1353 /*go through each segment*/
1354 for(i=0; i<numseg; i++)
1355 {
1356 /*if this segment is present*/
1357 if(seqpresence[(i*aligndim+j)/8]&jybitnum[(i*aligndim+j)%8])
1358 {
1359 /*check start plus seg length*/
1360 seg_start=seg_start+lenptr[i];
1361 if(seg_start>bslen)
1362 ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip, numseg);
1363 }
1364 }
1365 }
1366 }
1367 }
1368 }
1369 }
1370 MemFree(seqpresence);
1371 }
1372
1373 /******************************************************************
1374 check segment length, start and end point in Denseseg, Densediag and Stdseg
1375 ******************************************************************/
1376 static void ValidateSeqlengthinSeqAlign (SeqAlignPtr salp)
1377 {
1378
1379 if (salp)
1380 {
1381 if(salp->segtype==1)
1382 ValidateSeqlengthInDenseDiag ((DenseDiagPtr)salp->segs, salp);
1383 else if(salp->segtype==2)
1384 ValidateSeqlengthInDenseSeg ((DenseSegPtr)salp->segs, salp);
1385 else if(salp->segtype==3)
1386 ValidateSeqlengthInStdSeg ((StdSegPtr)salp->segs, salp);
1387 else if(salp->segtype==4)
1388 ValidateSeqlengthInPackSeg ((PackSegPtr)salp->segs, salp);
1389 }
1390 }
1391
1392 /******************************************************************
1393 check if # of seqid matches the dimensions, and
1394 if there is only one seqeuence in seqalign
1395 ******************************************************************/
1396 static void ValidateDimSeqIds (SeqAlignPtr salp)
1397 {
1398 SeqIdPtr sip=NULL;
1399 DenseDiagPtr ddp=NULL, ddptemp;
1400 StdSegPtr ssp=NULL, ssptemp;
1401 DenseSegPtr dsp=NULL;
1402 Int4 numseg=0;
1403
1404 if(salp)
1405 {
1406 /*densediag */
1407 if(salp->segtype==1)
1408 {
1409
1410 ddp=(DenseDiagPtr)salp->segs;
1411 if(!ddp)
1412 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1413 else
1414 for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1415 {
1416 sip=ddptemp->id;
1417 if(ddptemp->dim==1)
1418 ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1419 if(ddptemp->dim!=CountSeqIdInSip(sip))
1420 ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1421
1422 }
1423 }
1424
1425 /*denseseg, packseg */
1426 else if(salp->segtype==2||salp->segtype==4)
1427 {
1428 dsp=(DenseSegPtr) (salp->segs);
1429 if(!dsp)
1430 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1431 else
1432 {
1433 sip=dsp->ids;
1434 if(dsp->dim==1)
1435 ValMessage (salp, Err_SeqAlign_Dim_One, SEV_ERROR, NULL, sip , 0);
1436 if(dsp->dim!=CountSeqIdInSip(sip))
1437 ValMessage (salp, Err_SeqAlign_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , 0);
1438
1439 }
1440 }
1441
1442 /*stdseg */
1443 else if(salp->segtype==3)
1444 {
1445
1446 ssp=(StdSegPtr)salp->segs;
1447 if(!ssp)
1448 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1449 else
1450 for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1451 {
1452
1453 sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1454 if(ssptemp->dim==1)
1455 ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1456 if(ssptemp->dim!=CountSeqIdInSip( sip))
1457 ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1458 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1459
1460 SeqIdSetFree(sip);
1461 }
1462 }
1463 }
1464 }
1465
1466 /******************************************************************
1467 return true if a sip is contained in a seg, or false if otherwise
1468 Note it returns FASLE for an empty seqloc.
1469 It also returns false if error in sip or ssp
1470 ******************************************************************/
1471 static Boolean IsSipContainedInStdseg(SeqIdPtr sip, StdSegPtr ssp)
1472 {
1473 SeqLocPtr slp, slptemp;
1474
1475 if(!sip||!ssp)
1476 return FALSE;
1477
1478 slp=ssp->loc;
1479 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1480 {
1481 if(slptemp->choice!=SEQLOC_EMPTY&&SeqIdCmp(sip, SeqLocId(slptemp)))
1482 return TRUE;
1483 }
1484
1485 return FALSE;
1486 }
1487
1488 static Int4 PercentStringMatch (CharPtr string1, CharPtr string2)
1489 {
1490 Int4 len1, len2, min_len, k, max_len;
1491 Int4 num_match = 0;
1492
1493 if (StringHasNoText (string1) || StringHasNoText (string2))
1494 {
1495 return 0;
1496 }
1497 len1 = StringLen (string1);
1498 len2 = StringLen (string2);
1499
1500 if (len1 > len2)
1501 {
1502 min_len = len2;
1503 max_len = len1;
1504 }
1505 else
1506 {
1507 min_len = len1;
1508 max_len = len2;
1509 }
1510
1511 for (k = 0; k < min_len; k++)
1512 {
1513 if (string1[k] == string2[k] || string1[k] == 'N' || string2[k] == 'N')
1514 {
1515 num_match++;
1516 }
1517 }
1518 return (100 * num_match) / min_len;
1519 }
1520
1521 static Boolean CheckForPercentMatch (SeqIdPtr sip_list)
1522 {
1523 SeqIdPtr sip_temp, sip_next;
1524 BioseqPtr bsp;
1525 CharPtr master_seq = NULL, this_seq = NULL;
1526
1527 if (sip_list == NULL) return FALSE;
1528 sip_next = sip_list->next;
1529 sip_list->next = NULL;
1530 bsp = BioseqFind (sip_list);
1531 if (bsp != NULL)
1532 {
1533 master_seq = GetSequenceByBsp (bsp);
1534 }
1535 sip_list->next = sip_next;
1536 sip_temp = sip_next;
1537 if (bsp == NULL || master_seq == NULL)
1538 {
1539 return FALSE;
1540 }
1541
1542 for (sip_temp = sip_next; sip_temp != NULL; sip_temp = sip_next)
1543 {
1544 sip_next = sip_temp->next;
1545 sip_temp->next = NULL;
1546
1547 bsp = BioseqFind (sip_temp);
1548 if (bsp != NULL)
1549 {
1550 this_seq = GetSequenceByBsp (bsp);
1551 } else {
1552 this_seq = NULL;
1553 }
1554
1555 sip_temp->next = sip_next;
1556 if (bsp == NULL || StringHasNoText (this_seq) || PercentStringMatch (master_seq, this_seq) < 50)
1557 {
1558 MemFree (this_seq);
1559 return FALSE;
1560 }
1561 MemFree (this_seq);
1562 }
1563 return TRUE;
1564 }
1565
1566
1567 /******************************************************************
1568 check if an alignment is FASTA-like.
1569 If all gaps are at the 3' ends with dimensions>2, it's FASTA-like
1570 ******************************************************************/
1571 static Boolean Is_Fasta_Seqalign (SeqAlignPtr salp)
1572 {
1573
1574 SeqIdPtr siptemp=NULL;
1575 DenseSegPtr dsp;
1576 Int4Ptr startp;
1577 Boolean gap;
1578 Int4 k;
1579 Int2 j;
1580 SeqIdPtr bad_sip = NULL;
1581
1582 /*check only global or partial type*/
1583 if(salp->type!=1&&salp->type!=3)
1584 return FALSE;
1585
1586 if (salp->segtype != SAS_DENSEG) {
1587 ValMessage (salp, Err_Unexpected_Alignment_Type, SEV_ERROR, NULL, NULL, 0);
1588 } else {
1589 dsp = (DenseSegPtr) salp->segs;
1590 if(!dsp)
1591 {
1592 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1593 }
1594 else
1595 {
1596 if(dsp->dim<=2)
1597 {
1598 return FALSE;
1599 }
1600 /* if any sequence has gaps at the 5' end or internal gaps, the entire
1601 * alignment is declared to be valid.
1602 * if the sequence contains no gaps at all or only 3' end gaps, check
1603 * sequences for matches against the first sequence - if more than half
1604 * of the nucleotides are matches, then call this not FASTA-like.
1605 */
1606 for (j=0, siptemp=dsp->ids; j<dsp->dim&&siptemp; j++, siptemp=siptemp->next)
1607 {
1608 gap=FALSE;
1609
1610 for (k=0; k<dsp->numseg; k++)
1611 {
1612 startp=dsp->starts;
1613
1614 /*if start value is -1, set gap flag to true*/
1615 if (startp[dsp->dim*k + j] < 0)
1616 {
1617 gap = TRUE;
1618 }
1619 /*if a positive start value is found after the initial -1 start value, then it's not fasta like, no need to check this sequence further */
1620 else if(gap)
1621 {
1622 if (bad_sip != NULL)
1623 {
1624 SeqIdFree (bad_sip);
1625 }
1626 return FALSE;
1627 }
1628 /* if no positive start value is found after the initial -1 start value
1629 * (indicating that gaps exist only at the 5' end) or if no gaps
1630 * were found at all, flag this sequence as bad if it is the first found.
1631 */
1632 if(k==dsp->numseg-1)
1633 {
1634 if (bad_sip == NULL)
1635 {
1636 bad_sip = SeqIdDup (siptemp);
1637 }
1638 }
1639 }
1640 }
1641 if (bad_sip != NULL)
1642 {
1643 if (! CheckForPercentMatch (dsp->ids))
1644 {
1645 ValMessage (salp, Err_Fastalike, SEV_WARNING, bad_sip, dsp->ids, 0);
1646 SeqIdFree (bad_sip);
1647 return TRUE;
1648 }
1649 SeqIdFree (bad_sip);
1650 return FALSE;
1651 }
1652 }
1653 }
1654 /*no fasta like sequence is found*/
1655 return FALSE;
1656
1657 }
1658
1659
1660
1661 /******************************************************************
1662 check if there is a gap for all sequence in a segment
1663 ******************************************************************/
1664 static void Segment_Gap_In_SeqAlign(SeqAlignPtr salp)
1665 {
1666 Int4Ptr stptr=NULL;
1667 DenseSegPtr dsp=NULL;
1668 DenseDiagPtr ddp=NULL, ddptemp;
1669 StdSegPtr ssp=NULL, ssptemp;
1670 PackSegPtr psp=NULL;
1671 Uint1Ptr seqpresence=NULL;
1672 Int2 numseg, aligndim, i, j;
1673 SeqIdPtr sip=NULL;
1674 SeqLocPtr slp=NULL, slptemp;
1675
1676
1677 if(salp)
1678 {
1679 /*densediag*/
1680 if(salp->segtype==1)
1681 {
1682 ddp=(DenseDiagPtr)salp->segs;
1683 if(!ddp)
1684 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1685 else
1686 {
1687 for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1688 {
1689 sip=ddptemp->id;
1690 /*empty segment*/
1691 if(ddptemp->dim==0)
1692 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1693 }
1694 }
1695 }
1696
1697
1698 /*denseseg*/
1699 else if(salp->segtype==2)
1700 {
1701 dsp=(DenseSegPtr)salp->segs;
1702 if(!dsp)
1703 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1704 else
1705 {
1706 numseg=dsp->numseg;
1707 aligndim=dsp->dim;
1708 stptr=dsp->starts;
1709 sip=dsp->ids;
1710
1711 if(stptr==NULL)
1712 return;
1713
1714 /*go through each segment*/
1715 for(j=0; j<numseg; j++)
1716 {
1717 /*go through each sequence */
1718 for(i=0; i<aligndim; i++)
1719 {
1720
1721 if(stptr[j*aligndim+i]==-1)
1722 {
1723 /*all starts are -1 in this segment*/
1724 if(i==aligndim-1)
1725 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1726 }
1727 /*at least one start that is not -1*/
1728 else
1729 break;
1730
1731 }
1732 }
1733 }
1734 }
1735
1736 /*stdseg*/
1737 else if(salp->segtype==3)
1738 {
1739 ssp=(StdSegPtr)salp->segs;
1740 if(!ssp)
1741 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1742 else
1743 {
1744 /*go through each segment*/
1745 for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1746 {
1747 sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1748 slp=ssptemp->loc;
1749 /*go through each sequence*/
1750 for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1751 {
1752 if(slptemp->choice==SEQLOC_EMPTY||slptemp->choice==SEQLOC_NULL)
1753 {
1754 if(slptemp->next)
1755 continue;
1756 /*all seqloc are empty*/
1757 else
1758 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1759 }
1760 /*at least one non-empty seqloc*/
1761 else
1762 break;
1763 }
1764 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1765 SeqIdSetFree(sip);
1766
1767 }
1768 }
1769 }
1770 /*packseg*/
1771 else if(salp->segtype==4)
1772 {
1773 psp=(PackSegPtr)salp->segs;
1774 if(!psp)
1775 ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1776 else
1777 {
1778 numseg=psp->numseg;
1779 aligndim=psp->dim;
1780 sip=psp->ids;
1781 if(psp->present)
1782 {
1783 BSSeek(psp->present, 0, SEEK_SET);
1784 seqpresence=MemNew(BSLen(psp->present));
1785 if(!seqpresence)
1786 {
1787 ErrPostEx (SEV_ERROR, 0,0, "Warning:insufficient memory");
1788 return;
1789
1790 }
1791 BSRead(psp->present, seqpresence, BSLen(psp->present));
1792
1793 /*go through each segment*/
1794 for(j=0; j<numseg; j++)
1795 {
1796 /*go through each sequence */
1797 for(i=0; i<aligndim; i++)
1798 {
1799 /*check the presence of each sequence by determining the bit value in a byte (0, not present; otherwise present)*/
1800 if(!(seqpresence[(j*aligndim+i)/8]&jybitnum[(j*aligndim+i)%8]))
1801 {
1802 /*more sequence to go*/
1803 if(i<aligndim-1)
1804 continue;
1805 /*no sequence is present in this segment*/
1806 else if(i==aligndim-1)
1807 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1808 }
1809 /*at least one sequence is present*/
1810 else
1811 break;
1812 }
1813 }
1814 MemFree(seqpresence);
1815 }
1816 }
1817
1818 }
1819
1820
1821 }
1822 }
1823
1824
1825 static Boolean IsAlignmentTPA (SeqAlignPtr salp)
1826 {
1827 Boolean isTPA = FALSE;
1828 BioseqPtr bsp;
1829 SeqIdPtr sip = NULL, tmp_sip;
1830 SeqEntryPtr oldscope;
1831 DenseDiagPtr ddp;
1832 StdSegPtr ssp;
1833
1834 if (salp == NULL) {
1835 return FALSE;
1836 }
1837
1838 oldscope = SeqEntrySetScope (NULL);
1839
1840 switch (salp->segtype) {
1841 case SAS_DENDIAG:
1842 /*densediag */
1843 for (ddp = (DenseDiagPtr) salp->segs; ddp != NULL && !isTPA; ddp = ddp->next) {
1844 for (sip = SeqIdInAlignSegs((Pointer)ddp, salp->segtype, salp);
1845 sip != NULL && !isTPA;
1846 sip = sip->next) {
1847 bsp = BioseqLockById(sip);
1848 isTPA = HasTpaUserObject(bsp);
1849 BioseqUnlock(bsp);
1850 }
1851 }
1852 break;
1853 case SAS_STD:
1854 /*Stdseg*/
1855 for (ssp = (StdSegPtr) salp->segs; ssp != NULL && !isTPA; ssp = ssp->next) {
1856 sip = SeqIdInAlignSegs((Pointer)ssp, salp->segtype, salp);
1857 for (tmp_sip = sip;
1858 tmp_sip != NULL && !isTPA;
1859 tmp_sip = tmp_sip->next) {
1860 bsp = BioseqLockById(tmp_sip);
1861 isTPA = HasTpaUserObject(bsp);
1862 BioseqUnlock(bsp);
1863 }
1864 }
1865 /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1866 SeqIdSetFree(sip);
1867 break;
1868 case SAS_DENSEG:
1869 case SAS_PACKED:
1870 /*Denseseg, Packseg*/
1871 for (sip=SeqIdInAlignSegs(salp->segs, salp->segtype, salp);
1872 sip != NULL && !isTPA;
1873 sip = sip->next) {
1874 bsp = BioseqLockById(sip);
1875 isTPA = HasTpaUserObject(bsp);
1876 BioseqUnlock(bsp);
1877 }
1878 break;
1879 }
1880
1881 SeqEntrySetScope (oldscope);
1882 return isTPA;
1883 }
1884
1885
1886 static void CheckAlnSeqLens (SeqAlignPtr salp)
1887 {
1888 Int4 aln_len, start, stop;
1889 Int4 num_rows, row;
1890 SeqIdPtr sip;
1891 BioseqPtr bsp;
1892 Boolean is_shorter = FALSE;
1893
1894 if (salp == NULL) return;
1895
1896 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
1897 num_rows = AlnMgr2GetNumRows(salp);
1898 if (num_rows < 0) {
1899 return;
1900 }
1901
1902 for (row = 1; row <= num_rows && !is_shorter; row++) {
1903 sip = AlnMgr2GetNthSeqIdPtr(salp, row);
1904 bsp = BioseqFind (sip);
1905 if (bsp != NULL && bsp->idx.entityID == salp->idx.entityID) {
1906 AlnMgr2GetNthSeqRangeInSA(salp, row, &start, &stop);
1907 if ((stop > start && stop < bsp->length - 1) || (start > stop && start > bsp->length - 1)) {
1908 is_shorter = TRUE;
1909 }
1910 }
1911 sip = SeqIdFree (sip);
1912 }
1913 if (is_shorter) {
1914 ValMessage (salp, Err_Short_Aln, SEV_INFO, NULL, NULL, 0);
1915 }
1916 }
1917
1918
1919 /******************************************************************
1920 validate seqid, segment length, strand in Seqalignment for Denseseg,
1921 Densediag and Stdseg. Also check if it's FASTA-like
1922 ******************************************************************/
1923 static Boolean ValidateSeqAlignFunc (SeqAlignPtr salp, Boolean find_remote_bsp)
1924 {
1925 Boolean error=FALSE;
1926 Uint2 pcnt_identity;
1927 SeqAlignPtr salp_test;
1928
1929 if(salp==NULL)
1930 return FALSE;
1931
1932 /*validate if dimesion equals number of seqid*/
1933 ValidateDimSeqIds (salp);
1934
1935 if (find_remote_bsp) {
1936 ValidateSeqIdInSeqAlign (salp);
1937 ValidateSeqlengthinSeqAlign (salp);
1938 }
1939 /*validate strand*/
1940 ValidateStrandinSeqAlign (salp);
1941
1942 /*validate Fasta like*/
1943 if (Is_Fasta_Seqalign (salp))
1944 {
1945 error = TRUE;
1946 }
1947
1948 /*validate segment gap*/
1949 Segment_Gap_In_SeqAlign (salp);
1950
1951 if (!IsAlignmentTPA(salp)) {
1952 if (salp->segtype == SAS_DENDIAG) {
1953 /* duplicate alignment, to prevent indexing from changing the original type */
1954 salp_test = SeqAlignDup (salp);
1955 pcnt_identity = AlignmentPercentIdentityEx (salp_test, FALSE, TRUE);
1956 salp_test = SeqAlignFree (salp_test);
1957 } else {
1958 pcnt_identity = AlignmentPercentIdentityEx (salp, FALSE, TRUE);
1959 }
1960
1961 if (pcnt_identity < 50) {
1962 ValMessage (salp, Err_Pcnt_ID, SEV_WARNING, NULL, NULL, pcnt_identity);
1963 }
1964
1965 /* CheckAlnSeqLens (salp); */
1966 }
1967
1968 return error;
1969 }
1970
1971
1972 /******************************************************************
1973 validate each alignment sequentially.
1974 This function will subject the seqalign to all validation functions
1975 ******************************************************************/
1976 NLM_EXTERN Boolean ValidateSeqAlign (SeqAlignPtr salp, Uint2 entityID, Boolean message,
1977 Boolean msg_success, Boolean find_remote_bsp,
1978 Boolean delete_bsp, Boolean delete_salp, BoolPtr dirty)
1979 {
1980 SeqAlignPtr pre,
1981 salptmp;
1982 SaVal sv;
1983 SaValPtr svp;
1984 ValNodePtr vnp;
1985 JYErrorMsgPtr bemp;
1986 MsgAnswer ans;
1987 Int2 err_count=0,
1988 salp_count=0;
1989 Boolean retdel = FALSE;
1990
1991 if(salp!=NULL)
1992 {
1993 sv.message = message;
1994 sv.msg_success = msg_success;
1995 sv.find_remote_bsp = find_remote_bsp;
1996 sv.delete_salp = delete_salp;
1997 sv.delete_bsp = delete_bsp;
1998 sv.retdel = TRUE;
1999 sv.do_hist_assembly = FALSE;
2000 sv.ids = NULL;
2001 sv.entityID = entityID;
2002 sv.dirty = FALSE;
2003 svp = &sv;
2004 pre=NULL;
2005 salptmp=salp;
2006 while (salptmp)
2007 {
2008 salp_count++;
2009 if (salptmp->segtype == SAS_SPARSE) {
2010 ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2011 } else if (salptmp->segtype == SAS_SPLICED) {
2012 ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2013 }
2014 else if (salptmp->segtype==5)
2015 {
2016 ValidateSeqAlign ((SeqAlignPtr) (salptmp->segs), entityID, message, msg_success, find_remote_bsp, delete_bsp, delete_salp, &svp->dirty);
2017 }
2018 else if (salptmp->segtype<1 || salptmp->segtype>4)
2019 {
2020 ValMessage (salp, Err_Segtype, SEV_ERROR, NULL, NULL, salptmp->segtype);
2021 }
2022 else {
2023 ValidateSeqAlignFunc (salptmp, svp->find_remote_bsp);
2024 }
2025 if (errorp)
2026 {
2027 if(svp->message)
2028 {
2029 for (vnp=errorp; vnp!=NULL; vnp=vnp->next)
2030 {
2031 bemp=(JYErrorMsgPtr)vnp->data.ptrvalue;
2032 ErrPostEx ((ErrSev) bemp->level, 0, 0, bemp->msg);
2033 }
2034 }
2035 errorp = JYErrorChainDestroy (errorp);
2036 if (svp->delete_salp)
2037 {
2038 if (pre==NULL) {
2039 salp=salptmp->next;
2040 salptmp->next = NULL;
2041 SeqAlignFree (salptmp);
2042 salptmp = salp;
2043 }
2044 else {
2045 pre->next = salptmp->next;
2046 salptmp->next = NULL;
2047 SeqAlignFree (salptmp);
2048 salptmp = pre->next;
2049 }
2050 }
2051 else {
2052 salptmp = salptmp->next;
2053 }
2054 err_count++;
2055 svp->retdel=FALSE;
2056 }
2057 else {
2058 salptmp = salptmp->next;
2059 }
2060 }
2061 if (err_count==0 && svp->msg_success) {
2062 if (salp_count>1)
2063 ans = Message (MSG_OK, "Validation test of %d alignments succeeded", salp_count);
2064 else
2065 ans = Message (MSG_OK, "Validation test of the alignment succeeded");
2066 }
2067 if (dirty)
2068 *dirty = svp->dirty;
2069 retdel = svp->retdel;
2070 }
2071 return retdel;
2072 }
2073
2074
2075 /******************************************************************
2076 call back function for REGISTER_ALIGNVALIDATION defined in sequin4.c.
2077 Starting point for seqalign validation if user clicked on
2078 SeqalignValidation under menu Filer/Alignment.
2079 Either individual alignment or alignment block
2080 should be highlighted for this validation to work
2081 ******************************************************************/
2082
2083 NLM_EXTERN Int2 LIBCALLBACK ValidateSeqAlignFromData (Pointer data)
2084 {
2085
2086 OMProcControlPtr ompcp;
2087 SeqAlignPtr salp=NULL;
2088 SeqAnnotPtr sap=NULL;
2089 SeqEntryPtr sep=NULL;
2090
2091 ompcp = (OMProcControlPtr) data;
2092 if (ompcp == NULL || ompcp->proc == NULL) return OM_MSG_RET_ERROR;
2093
2094 if (ompcp->input_data == NULL) return OM_MSG_RET_ERROR;
2095
2096 switch(ompcp->input_itemtype)
2097 {
2098 case OBJ_BIOSEQ :
2099 sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2100 break;
2101 case OBJ_BIOSEQSET :
2102 sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2103 break;
2104 /*if clicked on alignment block*/
2105 case OBJ_SEQANNOT:
2106 sap=(SeqAnnotPtr) (ompcp->input_data);
2107 break;
2108 /*if clicked on individual alignment*/
2109 case OBJ_SEQALIGN:
2110 salp=(SeqAlignPtr) (ompcp->input_data);
2111 break;
2112 case 0 :
2113 return OM_MSG_RET_ERROR;
2114 default :
2115 return OM_MSG_RET_ERROR;
2116 }
2117
2118 ErrSetMessageLevel(SEV_ERROR);
2119 if(sap!=NULL)
2120 {
2121 salp=is_salp_in_sap(sap, 2);
2122 ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2123 }
2124 if (salp!=NULL) {
2125 ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2126 }
2127 if (sep!=NULL) {
2128 ValidateSeqAlignInSeqEntry (sep, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE);
2129 }
2130 return OM_MSG_RET_DONE;
2131 }
2132
2133 static void ValidateSeqAlignInAnnot (SeqAnnotPtr sap, SaValPtr svp)
2134
2135 {
2136 SeqAlignPtr salp;
2137
2138 while (sap != NULL) {
2139 if (sap->type == 2) {
2140 salp = (SeqAlignPtr) sap->data;
2141 if (salp != NULL) {
2142 ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2143 }
2144 }
2145 sap = sap->next;
2146 }
2147 }
2148
2149 static void ValidateSeqAlignInHist (SeqHistPtr hist, SaValPtr svp)
2150
2151 {
2152 SeqAlignPtr salp;
2153
2154 if (hist == NULL) return;
2155 salp = hist->assembly;
2156 /* ValidateSeqAlign will validate the entire chain */
2157 ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2158 }
2159
2160 static void ValidateSeqAlignCallback (SeqEntryPtr sep, Pointer mydata,
2161 Int4 index, Int2 indent)
2162 {
2163 BioseqPtr bsp;
2164 BioseqSetPtr bssp;
2165 SaValPtr svp;
2166
2167 if (sep != NULL && sep->data.ptrvalue && mydata != NULL) {
2168 svp = (SaValPtr)mydata;
2169 if (IS_Bioseq(sep)) {
2170 bsp = (BioseqPtr) sep->data.ptrvalue;
2171 if (bsp!=NULL) {
2172 ValidateSeqAlignInAnnot (bsp->annot, svp);
2173 if (svp != NULL && svp->do_hist_assembly) {
2174 ValidateSeqAlignInHist (bsp->hist, svp);
2175 }
2176 }
2177 }
2178 else if(IS_Bioseq_set(sep)) {
2179 bssp = (BioseqSetPtr)sep->data.ptrvalue;
2180 if (bssp!=NULL) {
2181 ValidateSeqAlignInAnnot (bssp->annot, svp);
2182 }
2183 }
2184 }
2185 }
2186
2187
2188
2189 NLM_EXTERN Boolean ValidateSeqAlignInSeqEntry (SeqEntryPtr sep, Boolean message,
2190 Boolean msg_success, Boolean find_remote_bsp,
2191 Boolean delete_bsp, Boolean delete_salp,
2192 Boolean do_hist_assembly)
2193 {
2194 SeqEntryPtr sep_head;
2195 Uint2 entityID;
2196 SaVal sv;
2197 Boolean success=TRUE;
2198
2199 entityID = ObjMgrGetEntityIDForChoice (sep);
2200 if (entityID > 0) {
2201 sep_head = GetTopSeqEntryForEntityID (entityID);
2202 if (sep_head != NULL) {
2203 sv.message = message;
2204 sv.msg_success = msg_success;
2205 sv.find_remote_bsp = find_remote_bsp;
2206 sv.find_acc_bsp = FALSE;
2207 sv.delete_salp = delete_salp;
2208 sv.delete_bsp = delete_bsp;
2209 sv.retdel = TRUE;
2210 sv.do_hist_assembly = do_hist_assembly;
2211 sv.ids = NULL;
2212 sv.entityID = entityID;
2213 sv.dirty = FALSE;
2214 SeqEntryExplore (sep_head, (Pointer)&sv, ValidateSeqAlignCallback);
2215 if (sv.dirty) {
2216 ObjMgrSetDirtyFlag (entityID, TRUE);
2217 ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
2218 }
2219 success = sv.retdel;
2220 }
2221 }
2222 return success;
2223 }
2224
2225
2226 /* alignment validator private for regular validator */
2227
2228 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly);
2229
2230 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly)
2231
2232 {
2233 GatherContext gc;
2234 Boolean rsult;
2235
2236 if (vsp == NULL || sep == NULL) return FALSE;
2237 useLockByID = vsp->farIDsInAlignments;
2238 useValErr = TRUE;
2239 useVsp = vsp;
2240 vsp->gcp = &gc;
2241 vsp->bssp = NULL;
2242 vsp->bsp = NULL;
2243 vsp->sfp = NULL;
2244 vsp->descr = NULL;
2245 MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
2246 rsult = ValidateSeqAlignInSeqEntry (sep, FALSE, FALSE, find_remote_bsp, FALSE, FALSE, do_hist_assembly);
2247 useLockByID = TRUE;
2248 useValErr = FALSE;
2249 useVsp = NULL;
2250 return rsult;
2251 }
2252
2253
2254 /* PopulateSample and ReadFromAlignmentSample are utility functions for AlignmentPercentIdentity */
2255 static void PopulateSample (Uint1Ptr seqbuf_list, Int4Ptr start_list,
2256 Int4 sample_len, BioseqPtr PNTR bsp_list,
2257 Int4 row)
2258 {
2259 Char ch;
2260
2261 if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2262 || bsp_list[row] == NULL || start_list[row] < 0 || start_list[row] >= bsp_list[row]->length) {
2263 return;
2264 }
2265
2266 ch = *(seqbuf_list + (row + 1) * sample_len);
2267
2268 SeqPortStreamInt (bsp_list[row],
2269 start_list[row],
2270 MIN (start_list[row] + sample_len - 1, bsp_list[row]->length - 1),
2271 Seq_strand_plus,
2272 0,
2273 seqbuf_list + row * sample_len,
2274 NULL);
2275
2276 /* put back char overwritten by SeqPortStreamInt */
2277 *(seqbuf_list + (row + 1) * sample_len) = ch;
2278
2279 }
2280
2281
2282 static Uint1 ComplementChar (Uint1 ch)
2283 {
2284 if (ch == 'A') {
2285 return 'T';
2286 } else if (ch == 'T') {
2287 return 'A';
2288 } else if (ch == 'G') {
2289 return 'C';
2290 } else if (ch == 'C') {
2291 return 'G';
2292 } else {
2293 return ch;
2294 }
2295 }
2296
2297 static Uint1 ReadFromAlignmentSample(Uint1Ptr seqbuf_list, Int4Ptr start_list,
2298 Int4 sample_len, BioseqPtr PNTR bsp_list,
2299 Uint1Ptr strand_list,
2300 Int4 row, Int4 seq_pos)
2301 {
2302 Uint1 ch = 0;
2303
2304 if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2305 || bsp_list[row] == NULL || seq_pos < 0 || seq_pos >= bsp_list[row]->length) {
2306 return 0;
2307 }
2308
2309 if (seq_pos < start_list[row] || seq_pos >= start_list[row] + sample_len) {
2310 start_list[row] = (seq_pos / sample_len) * sample_len;
2311 PopulateSample (seqbuf_list, start_list,
2312 sample_len, bsp_list,
2313 row);
2314 }
2315 ch = seqbuf_list[(row * sample_len) + seq_pos - start_list[row]];
2316 if (strand_list[row] == Seq_strand_minus) {
2317 ch = ComplementChar(ch);
2318 }
2319 return ch;
2320 }
2321
2322 typedef struct ambchar {
2323 Char ambig_char;
2324 CharPtr match_list;
2325 } AmbCharData, PNTR AmbCharPtr;
2326
2327 static const AmbCharData ambiguity_list[] = {
2328 { 'R', "AG" },
2329 { 'Y', "CT" },
2330 { 'M', "AC" },
2331 { 'K', "GT" },
2332 { 'S', "CG" },
2333 { 'W', "AT" },
2334 { 'H', "ACT" },
2335 { 'B', "CGT" },
2336 { 'V', "ACG" },
2337 { 'D', "AGT" }};
2338
2339 static const Int4 num_ambiguities = sizeof (ambiguity_list) / sizeof (AmbCharData);
2340
2341 static Char AmbiguousMatch (Char ch1, Char ch2)
2342 {
2343 Int4 i;
2344 for (i = 0; i < num_ambiguities; i++) {
2345 if (ch1 == ambiguity_list[i].ambig_char
2346 && StringChr (ambiguity_list[i].match_list, ch2)) {
2347 return ch2;
2348 } else if (ch2 == ambiguity_list[i].ambig_char
2349 && StringChr (ambiguity_list[i].match_list, ch1)) {
2350 return ch1;
2351 }
2352 }
2353 return 0;
2354 }
2355
2356
2357 extern double *
2358 GetAlignmentColumnPercentIdentities
2359 (SeqAlignPtr salp,
2360 Int4 start,
2361 Int4 stop,
2362 Boolean internal_gaps,
2363 Boolean internal_validation)
2364 {
2365 Int4 aln_len, num_rows, row, col_count = 0;
2366 Int4 num_match;
2367 Int4 aln_pos, seq_pos, k;
2368 Uint1 row_ch;
2369 SeqEntryPtr oldscope;
2370 SeqIdPtr PNTR sip_list;
2371 BioseqPtr PNTR bsp_list;
2372 Uint1Ptr strand_list;
2373 BoolPtr start_gap, end_gap;
2374 Int4Ptr start_list;
2375 Uint1Ptr seqbuf_list;
2376 Int4 sample_len = 50;
2377 Int4 chars_appearing[5]; /* 0 is A, 1 is T, 2 is G, 3 is C, 4 is internal gap */
2378 Int4 max_app, total_app, i;
2379 double * pct_ids;
2380
2381 if (salp == NULL || start < 0 || stop < start) return NULL;
2382
2383 AlnMgr2IndexSingleChildSeqAlign(salp);
2384 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2385 num_rows = AlnMgr2GetNumRows(salp);
2386 if (num_rows < 0) {
2387 Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2388 return NULL;
2389 }
2390
2391 pct_ids = (double *) MemNew (sizeof (double) * (stop - start + 1));
2392 MemSet (pct_ids, 0, sizeof (double) * (stop - start + 1));
2393
2394 bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2395 sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2396 strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2397 start_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2398 end_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2399 for (row = 1; row <= num_rows; row++) {
2400 sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2401 strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2402 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2403 if (bsp_list[row - 1] == NULL) {
2404 oldscope = SeqEntrySetScope (NULL);
2405 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2406 SeqEntrySetScope(oldscope);
2407 if (bsp_list[row - 1] == NULL) {
2408 break;
2409 }
2410 }
2411 start_gap[row - 1] = TRUE;
2412 end_gap[row - 1] = FALSE;
2413 }
2414
2415 if (row <= num_rows) {
2416 Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2417 while (row >= 0) {
2418 sip_list[row] = SeqIdFree(sip_list[row]);
2419 BioseqUnlock(bsp_list[row]);
2420 row--;
2421 }
2422 sip_list = MemFree (sip_list);
2423 bsp_list = MemFree (bsp_list);
2424 start_gap = MemFree (start_gap);
2425 end_gap = MemFree (end_gap);
2426 return 0;
2427 }
2428
2429 start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2430 seqbuf_list = (Uint1Ptr) MemNew (num_rows * sample_len * sizeof(Uint1));
2431 for (row = 0; row < num_rows; row++) {
2432 start_list[row] = 0;
2433 PopulateSample (seqbuf_list, start_list,
2434 sample_len, bsp_list,
2435 row);
2436 }
2437
2438 num_match = 0;
2439 for (aln_pos = start; aln_pos < aln_len && aln_pos <= stop; aln_pos++) {
2440 /* init lists */
2441 MemSet (chars_appearing, 0, sizeof (chars_appearing));
2442 for (row = 1; row <= num_rows; row++) {
2443 if (end_gap[row - 1]) {
2444 continue;
2445 }
2446 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2447 if (seq_pos < 0) {
2448 if (start_gap[row - 1] || end_gap[row - 1]) {
2449 /* beginning/end gap - never counts against percent identity */
2450 } else {
2451 k = aln_pos + 1;
2452 while (k < aln_len && seq_pos < 0) {
2453 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, k, row);
2454 k++;
2455 }
2456 if (seq_pos < 0) {
2457 /* now in end_gap for this sequence */
2458 end_gap[row - 1] = TRUE;
2459 } else {
2460 /* internal gaps count against percent identity when specified */
2461 if (internal_gaps) {
2462 chars_appearing[4] ++;
2463 }
2464 }
2465 }
2466 } else {
2467 start_gap[row - 1] = FALSE;
2468
2469 row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2470 sample_len, bsp_list, strand_list,
2471 row - 1, seq_pos);
2472 switch (row_ch) {
2473 case 'A':
2474 chars_appearing[0]++;
2475 break;
2476 case 'T':
2477 chars_appearing[1]++;
2478 break;
2479 case 'G':
2480 chars_appearing[2]++;
2481 break;
2482 case 'C':
2483 chars_appearing[3]++;
2484 break;
2485 default:
2486 /* we don't count ambiguity characters */
2487 break;
2488 }
2489 }
2490 }
2491 max_app = 0;
2492 total_app = 0;
2493 for (i = 0; i < 4; i++) {
2494 if (chars_appearing[i] > max_app) {
2495 max_app = chars_appearing[i];
2496 }
2497 total_app += chars_appearing[i];
2498 }
2499 /* add in internal gaps */
2500 total_app += chars_appearing[4];
2501 if (total_app > 0) {
2502 pct_ids[aln_pos - start] = (double) max_app / (double) total_app;
2503 }
2504 col_count++;
2505 }
2506
2507 for (row = 0; row < num_rows; row++) {
2508 sip_list[row] = SeqIdFree(sip_list[row]);
2509 BioseqUnlock(bsp_list[row]);
2510 }
2511 sip_list = MemFree (sip_list);
2512 bsp_list = MemFree (bsp_list);
2513 start_gap = MemFree (start_gap);
2514 end_gap = MemFree (end_gap);
2515 start_list = MemFree (start_list);
2516 seqbuf_list = MemFree (seqbuf_list);
2517
2518 return pct_ids;
2519 }
2520
2521
2522 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation)
2523 {
2524 Int4 aln_len, num_rows, row, col_count = 0;
2525 Int4 num_match;
2526 Uint2 pcnt;
2527 Boolean row_match;
2528 Int4 aln_pos, seq_pos, tmp;
2529 Uint1 seq_ch, row_ch, amb_match;
2530 SeqEntryPtr oldscope;
2531 SeqIdPtr PNTR sip_list;
2532 BioseqPtr PNTR bsp_list;
2533 Uint1Ptr strand_list;
2534 Int4Ptr start_list;
2535 Uint1Ptr seqbuf_list;
2536 Int4 sample_len = 50;
2537 Int4Ptr starts, stops;
2538
2539 if (salp == NULL) return 0;
2540
2541 AlnMgr2IndexSingleChildSeqAlign(salp);
2542 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2543 num_rows = AlnMgr2GetNumRows(salp);
2544 if (num_rows < 0) {
2545 if (! internal_validation) {
2546 Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2547 }
2548 return 0;
2549 }
2550 bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2551 sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2552 strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2553 starts = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2554 stops = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2555 for (row = 1; row <= num_rows; row++) {
2556 sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2557 strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2558 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2559 if (bsp_list[row - 1] == NULL) {
2560 oldscope = SeqEntrySetScope (NULL);
2561 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2562 SeqEntrySetScope(oldscope);
2563 if (bsp_list[row - 1] == NULL) {
2564 break;
2565 }
2566 }
2567 /* get endpoints for each row */
2568 AlnMgr2GetNthSeqRangeInSA(salp, row, starts + row - 1, stops + row - 1);
2569 starts[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, starts[row - 1], row);
2570 stops[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, stops[row - 1], row);
2571 if (starts[row - 1] > stops[row - 1]) {
2572 tmp = starts[row - 1];
2573 starts[row - 1] = stops[row - 1];
2574 stops[row - 1] = tmp;
2575 }
2576
2577 }
2578
2579 if (row <= num_rows) {
2580 if (! internal_validation) {
2581 Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2582 }
2583 while (row > 0) {
2584 sip_list[row - 1] = SeqIdFree(sip_list[row - 1]);
2585 BioseqUnlock(bsp_list[row - 1]);
2586 row--;
2587 }
2588 sip_list = MemFree (sip_list);
2589 bsp_list = MemFree (bsp_list);
2590 starts = MemFree (starts);
2591 stops = MemFree (stops);
2592 return 0;
2593 }
2594
2595 start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2596 seqbuf_list = (Uint1Ptr) MemNew ((num_rows * sample_len + 1) * sizeof(Uint1));
2597 for (row = 0; row < num_rows; row++) {
2598 start_list[row] = 0;
2599 PopulateSample (seqbuf_list, start_list,
2600 sample_len, bsp_list,
2601 row);
2602 }
2603
2604 num_match = 0;
2605 for (aln_pos = 0; aln_pos < aln_len; aln_pos++) {
2606 row_match = TRUE;
2607 seq_ch = 0;
2608 for (row = 1; row <= num_rows; row++) {
2609 if (aln_pos < starts[row - 1] || aln_pos > stops[row - 1]) {
2610 continue;
2611 }
2612 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2613 if (seq_pos < 0) {
2614 if (internal_gaps) {
2615 row_match = FALSE;
2616 }
2617 } else {
2618 row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2619 sample_len, bsp_list, strand_list,
2620 row - 1, seq_pos);
2621 if (row_ch == 'N') {
2622 /* do nothing - Ns do not count against percent identity */
2623 } else if (seq_ch == 0) {
2624 seq_ch = row_ch;
2625 } else if (seq_ch != row_ch) {
2626 amb_match = AmbiguousMatch (seq_ch, row_ch);
2627 if (amb_match == 0) {
2628 row_match = FALSE;
2629 } else {
2630 seq_ch = amb_match;
2631 }
2632 }
2633 }
2634 }
2635 if (row_match) {
2636 num_match++;
2637 }
2638 col_count++;
2639 }
2640
2641 for (row = 0; row < num_rows; row++) {
2642 sip_list[row] = SeqIdFree(sip_list[row]);
2643 BioseqUnlock(bsp_list[row]);
2644 }
2645 sip_list = MemFree (sip_list);
2646 bsp_list = MemFree (bsp_list);
2647 starts = MemFree (starts);
2648 stops = MemFree (stops);
2649 start_list = MemFree (start_list);
2650 seqbuf_list = MemFree (seqbuf_list);
2651
2652 if (col_count == 0) {
2653 pcnt = 0;
2654 } else {
2655 pcnt = (100 * num_match) / col_count;
2656 }
2657 return pcnt;
2658 }
2659
2660 extern Uint2 AlignmentPercentIdentity (SeqAlignPtr salp, Boolean internal_gaps)
2661 {
2662 return AlignmentPercentIdentityEx (salp, internal_gaps, FALSE);
2663 }
2664
2665 extern Uint2 WeightedAlignmentPercentIdentity (SeqAlignPtr salp, Boolean internal_gaps)
2666 {
2667 Int4 aln_len, num_rows, row, col_count = 0;
2668 Int4 num_match;
2669 Uint2 pcnt;
2670 Int4 aln_pos, seq_pos, k;
2671 Uint1 row_ch;
2672 SeqEntryPtr oldscope;
2673 SeqIdPtr PNTR sip_list;
2674 BioseqPtr PNTR bsp_list;
2675 Uint1Ptr strand_list;
2676 BoolPtr start_gap, end_gap;
2677 Int4Ptr start_list;
2678 Uint1Ptr seqbuf_list;
2679 Int4 sample_len = 50;
2680 Int4 chars_appearing[5]; /* 0 is A, 1 is T, 2 is G, 3 is C, 4 is internal gap */
2681 double col_pct, col_pct_total = 0;
2682 Int4 max_app, total_app, i;
2683
2684 if (salp == NULL) return 0;
2685
2686 AlnMgr2IndexSingleChildSeqAlign(salp);
2687 aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2688 num_rows = AlnMgr2GetNumRows(salp);
2689 if (num_rows < 0) {
2690 Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2691 return 0;
2692 }
2693 bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2694 sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2695 strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2696 start_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2697 end_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2698 for (row = 1; row <= num_rows; row++) {
2699 sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2700 strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2701 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2702 if (bsp_list[row - 1] == NULL) {
2703 oldscope = SeqEntrySetScope (NULL);
2704 bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2705 SeqEntrySetScope(oldscope);
2706 if (bsp_list[row - 1] == NULL) {
2707 break;
2708 }
2709 }
2710 start_gap[row - 1] = TRUE;
2711 end_gap[row - 1] = FALSE;
2712 }
2713
2714 if (row <= num_rows) {
2715 Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2716 while (row >= 0) {
2717 sip_list[row] = SeqIdFree(sip_list[row]);
2718 BioseqUnlock(bsp_list[row]);
2719 row--;
2720 }
2721 sip_list = MemFree (sip_list);
2722 bsp_list = MemFree (bsp_list);
2723 start_gap = MemFree (start_gap);
2724 end_gap = MemFree (end_gap);
2725 return 0;
2726 }
2727
2728 start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2729 seqbuf_list = (Uint1Ptr) MemNew (num_rows * sample_len * sizeof(Uint1));
2730 for (row = 0; row < num_rows; row++) {
2731 start_list[row] = 0;
2732 PopulateSample (seqbuf_list, start_list,
2733 sample_len, bsp_list,
2734 row);
2735 }
2736
2737 num_match = 0;
2738 for (aln_pos = 0; aln_pos < aln_len; aln_pos++) {
2739 /* init lists */
2740 MemSet (chars_appearing, 0, sizeof (chars_appearing));
2741 for (row = 1; row <= num_rows; row++) {
2742 if (end_gap[row - 1]) {
2743 continue;
2744 }
2745 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2746 if (seq_pos < 0) {
2747 if (start_gap[row - 1] || end_gap[row - 1]) {
2748 /* beginning/end gap - never counts against percent identity */
2749 } else {
2750 k = aln_pos + 1;
2751 while (k < aln_len && seq_pos < 0) {
2752 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, k, row);
2753 k++;
2754 }
2755 if (seq_pos < 0) {
2756 /* now in end_gap for this sequence */
2757 end_gap[row - 1] = TRUE;
2758 } else {
2759 /* internal gaps count against percent identity when specified */
2760 if (internal_gaps) {
2761 chars_appearing[4] ++;
2762 }
2763 }
2764 }
2765 } else {
2766 start_gap[row - 1] = FALSE;
2767
2768 row_ch = ReadFromAlignmentSample(seqbuf_list, start_list,
2769 sample_len, bsp_list, strand_list,
2770 row - 1, seq_pos);
2771 switch (row_ch) {
2772 case 'A':
2773 chars_appearing[0]++;
2774 break;
2775 case 'T':
2776 chars_appearing[1]++;
2777 break;
2778 case 'G':
2779 chars_appearing[2]++;
2780 break;
2781 case 'C':
2782 chars_appearing[3]++;
2783 break;
2784 default:
2785 /* we don't count ambiguity characters */
2786 break;
2787 }
2788 }
2789 }
2790 max_app = 0;
2791 total_app = 0;
2792 for (i = 0; i < 4; i++) {
2793 if (chars_appearing[i] > max_app) {
2794 max_app = chars_appearing[i];
2795 }
2796 total_app += chars_appearing[i];
2797 }
2798 if (total_app > 0) {
2799 col_pct = (double) max_app / (double) total_app;
2800 col_pct_total += col_pct;
2801 }
2802 col_count++;
2803 }
2804
2805 for (row = 0; row < num_rows; row++) {
2806 sip_list[row] = SeqIdFree(sip_list[row]);
2807 BioseqUnlock(bsp_list[row]);
2808 }
2809 sip_list = MemFree (sip_list);
2810 bsp_list = MemFree (bsp_list);
2811 start_gap = MemFree (start_gap);
2812 end_gap = MemFree (end_gap);
2813 start_list = MemFree (start_list);
2814 seqbuf_list = MemFree (seqbuf_list);
2815
2816 if (col_count == 0) {
2817 pcnt = 0;
2818 } else {
2819 pcnt = (100 * col_pct_total) / col_count;
2820 }
2821 return pcnt;
2822 }
2823
2824 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |