NCBI C Toolkit Cross Reference

C/api/alignval.c


  1 /*  alignval.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  alignval.c
 27 *
 28 * Author:  Jian Ye, Colombe Chappey
 29 *
 30 * Version Creation Date:   6/3/99
 31 *
 32 * $Revision: 6.73 $
 33 *
 34 * File Description:  To validate sequence alignment.
 35 *
 36 * Modifications:  
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45  
 46 #include <ncbi.h>
 47 #include <seqmgr.h>
 48 #include <objmgr.h>
 49 #include <sequtil.h> 
 50 #include <sqnutils.h>
 51 #include <satutil.h>
 52 #include <salsap.h>
 53 #include <txalign.h>
 54 #include <salpacc.h>
 55 #include <alignval.h>
 56 #include <valid.h>
 57 #include <alignmgr2.h>
 58 
 59 
/* Single-bit masks for one byte, ordered from the most significant bit
 * (index 0, 0x80) down to the least significant bit (index 7, 0x01).
 * jybitnum[k % 8] is used together with byte k / 8 of a packed bit array to
 * test bit k. */
Uint1  jybitnum[8]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
 61 
/* Bundle of option flags and bookkeeping carried through an alignment
 * validation run.
 * NOTE(review): none of these fields is read or written in the portion of
 * the file reviewed here; their exact meaning should be confirmed against
 * the code that fills in and consumes a SaVal. */
typedef struct saval {
  Boolean     message;
  Boolean     msg_success;
  Boolean     find_remote_bsp;
  Boolean     find_acc_bsp;
  Boolean     delete_salp;
  Boolean     delete_bsp;
  Boolean     retdel;
  Boolean     do_hist_assembly;
  ValNodePtr  ids;
  Uint2       entityID;
  Boolean     dirty;
} SaVal, PNTR SaValPtr;
 75 
/* One queued error message: a severity level plus the message text.
 * Records of this type are stored in the data.ptrvalue of ValNode chains
 * built by JYConstructErrorMessage and released by JYErrorChainDestroy. */
typedef struct JY_error_msg {
        Uint1 level;/* corresponds to levels of ErrPostEx [none(0), info(1), warn(2), error(3) and fatal(4)] */
        CharPtr msg;
} JYErrorMsg, *JYErrorMsgPtr;
 81 
 82 /******************************************************************
 83 ***
 84 *** Error Messaging
 85 ***    copies of the BLAST functions in blastpri.h
 86 ***    JYConstructErrorMessage = BlastConstructErrorMessage
 87 ***    JYErrorChainDestroy = BlastErrorChainDestroy
 88 ***
 89 ******************************************************************/ 
 90 
/* Chain of JYErrorMsg records collected by ValMessage when messages are not
 * routed through ValidErr. */
static ValNodePtr errorp = NULL;
/* Buffer capacity, in bytes, for formatted message text. */
#define BUFFER_LENGTH 512

/* Forward declaration; the definition is not part of the code shown here. */
static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation);
 95 
 96 static ValNodePtr JYConstructErrorMessage (CharPtr function, CharPtr message, Uint1 level, ValNodePtr PNTR vnpp)
 97 {
 98         Char buffer[BUFFER_LENGTH];
 99         CharPtr ptr;
100         JYErrorMsgPtr error_msg;
101 
102         if (vnpp == NULL)
103                 return NULL;
104 
105         buffer[0] = NULLB;
106         ptr = buffer;
107         if (function != NULL)
108         {
109                 sprintf(buffer, "%s: ", function);
110                 ptr = buffer + StringLen(buffer);
111         }
112 
113         if (message != NULL)
114         {
115                 sprintf(ptr, "%s", message);
116         }
117 
118         error_msg = (JYErrorMsgPtr) MemNew(sizeof(JYErrorMsg));
119         error_msg->msg = StringSave(buffer);
120         error_msg->level = level;
121 
122         ValNodeAddPointer(vnpp, 0, error_msg);
123 
124         return *vnpp;
125 }
126 
127 static ValNodePtr JYErrorChainDestroy (ValNodePtr vnp)
128 
129 {
130         ValNodePtr start = vnp;
131         JYErrorMsgPtr error_msg;
132 
133         while (vnp)
134         {
135            error_msg = (JYErrorMsgPtr) vnp->data.ptrvalue;
136            if (error_msg != NULL) {
137               MemFree(error_msg->msg);
138            }
139            vnp->data.ptrvalue = MemFree(vnp->data.ptrvalue);
140            vnp = vnp->next;
141         }
142 
143         ValNodeFree(start);
144 
145         return NULL;
146 }
147 /******************************************************************
148 Output error message according to code defined in alignval.h.  
149 id refers to seqid of the sequence that causes the error 
150 and idcontext refers to other sequences in the same segment.  
151 Intvalue is used to indicate 1) the segment where the sequence 
152 with error is, or 2) the segtype in case of segtype error.  
153 Please note that not all errors report all three 
154 parameters(id, idcontext, Intvalue)
155 ******************************************************************/ 
156 
/* When TRUE, ValMessage reports through ValidErr (using useVsp) instead of
 * appending to the errorp chain. */
static Boolean useValErr = FALSE;
/* When TRUE, AlignValBioseqLockById uses BioseqLockById and
 * AlignValBioseqUnlock uses BioseqUnlock; otherwise BioseqFindCore is used
 * and unlocking is a no-op. */
static Boolean useLockByID = FALSE;
/* Validator context handed to ValidErr by ValMessage when useValErr is set. */
static ValidStructPtr useVsp = NULL;
160 
161 static BioseqPtr AlignValBioseqLockById (SeqIdPtr sid)
162 
163 {
164   Int4 old_sev;
165   BioseqPtr bsp = NULL;
166 
167   if (useLockByID) {
168     old_sev = ErrSetMessageLevel (SEV_WARNING);
169     bsp = BioseqLockById (sid);
170     ErrSetMessageLevel ((ErrSev) old_sev);
171   } else {
172     bsp = BioseqFindCore (sid);
173   }
174   return bsp;
175 }
176 
177 static Boolean AlignValBioseqUnlock (BioseqPtr bsp)
178 
179 {
180   if (useLockByID) {
181     return BioseqUnlock (bsp);
182   } else {
183     return TRUE;
184   }
185 }
186 
/* Prototype of the printf-style reporting routine called by ValMessage when
 * useValErr is set; it is only declared here, not defined. */
NLM_EXTERN void CDECL  ValidErr VPROTO((ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...));
188 
189 /*****************************************************************
190 *  get the approximate sequence coordinate for an alignment segment
191 *  sip == NULL -> get alignment coordinate
192 *****************************************************************/
193 static Int4 valmsggetseqpos(SeqAlignPtr sap, Int4 segment, SeqIdPtr sip)
194 {
195    Int4          c;
196    DenseDiagPtr  ddp;
197    DenseSegPtr   dsp;
198    Boolean       found;
199    Int4          i;
200    Int4          j;
201    Int4          pos;
202    PackSegPtr    psp;
203    Uint1Ptr      seqpresence;
204    SeqIdPtr      sip_tmp;
205    SeqLocPtr     slp;
206    StdSegPtr     ssp;
207 
208    if (sap == NULL || sap->segs == NULL || segment == 0)
209       return -1;
210    if (sap->segtype == SAS_DENSEG)
211    {
212       dsp = (DenseSegPtr)sap->segs;
213       if (sip == NULL)
214       {
215          pos = 0;
216          for (c=0; c<segment; c++)
217          {
218             pos += dsp->lens[c];
219          }
220          return pos;
221       }
222       sip_tmp = dsp->ids;
223       i = 0;
224       found = FALSE;
225       while (!found && sip_tmp != NULL)
226       {
227          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
228             found = TRUE;
229          else
230          {
231             sip_tmp = sip_tmp->next;
232             i++;
233          }
234       }
235       if (!found || i>dsp->dim || segment > dsp->numseg)
236          return -1;
237       pos = 0;
238       for (c=0; c<segment; c++)
239       {
240          if ((j = dsp->starts[(dsp->dim*c)+i])>0)
241             pos=j;
242       }
243       return pos;
244    } else if (sap->segtype == SAS_DENDIAG)
245    {
246       ddp = (DenseDiagPtr)sap->segs;
247       pos = 0;
248       for (c=0; c<segment; c++)
249       {
250          pos += ddp->len;
251          ddp = ddp->next;
252          if (ddp == NULL)
253             return -1;
254       }
255       if (sip == NULL)
256          return pos;
257       sip_tmp = ddp->id;
258       i = 0;
259       found = FALSE;
260       while (!found && sip_tmp != NULL)
261       {
262          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
263             found = TRUE;
264          else
265          {
266             sip_tmp = sip_tmp->next;
267             i++;
268          }
269       }
270       if (!found || i>ddp->dim)
271          return -1;
272       return (ddp->starts[i]);
273    } else if (sap->segtype == SAS_STD)
274    {
275       ssp = (StdSegPtr)(sap->segs);
276       pos = 0;
277       for (c=0; c<segment-1; c++)
278       {
279          pos += SeqLocLen(ssp->loc);
280          ssp = ssp->next;
281          if (ssp == NULL)
282             return -1;
283       }
284       if (sip == NULL)
285          return pos;
286       slp = ssp->loc;
287       found = FALSE;
288       while (!found && slp!=NULL)
289       {
290          sip_tmp = SeqLocId(slp);
291          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
292             found = TRUE;
293          else
294             slp = slp->next;
295       }
296       if (!found)
297          return -1;
298       return (SeqLocStart(slp));
299    } else if (sap->segtype == SAS_PACKED)
300    {
301       psp = (PackSegPtr)(sap->segs);
302       if (segment > psp->numseg)
303          return -1;
304       if (sip == NULL)
305       {
306          pos = 0;
307          for (c=0; c<segment; c++)
308          {
309             pos += psp->lens[c];
310          }
311          return pos;
312       }
313       sip_tmp = psp->ids;
314       i = 0;
315       found = FALSE;
316       while (!found && sip_tmp != NULL)
317       {
318          if (SeqIdComp(sip, sip_tmp) == SIC_YES)
319             found = TRUE;
320          else
321          {
322             sip_tmp = sip_tmp->next;
323             i++;
324          }
325       }
326       if (!found || i>psp->dim)
327          return -1;
328       pos = 0;
329       seqpresence = NULL;
330       BSSeek(psp->present, 0, SEEK_SET);
331       seqpresence=MemNew(BSLen(psp->present));
332       if(!seqpresence)
333          return -1;
334       BSRead(psp->present, seqpresence, BSLen(psp->present));
335       for (c=0; c<segment; c++)
336       {
337          if (seqpresence[(c*psp->numseg+i)/8]&jybitnum[(c*psp->numseg+i)%8])
338             pos+=psp->lens[c];
339       }
340       return pos;
341    } else
342       return -1;
343 }
344 
345 
346 static BioseqPtr BioseqForAlignment (SeqAlignPtr salp)
347 {
348   Int4 row, num_rows;
349   BioseqPtr bsp = NULL;
350   SeqIdPtr  sip;
351   SeqEntryPtr oldscope;
352   DenseDiagPtr ddp;
353   
354   oldscope = SeqEntrySetScope (NULL);
355   /* NOTE - can't index DenseDiag chain during validation because we're examining the individual DenseDiags,
356    * and indexing converts it to DenseSegs.
357    */
358   if (salp->segtype == SAS_DENDIAG && salp->segs != NULL) {
359     ddp = (DenseDiagPtr) salp->segs;
360     while (bsp == NULL && ddp != NULL) {
361       for (sip = ddp->id; bsp == NULL && sip != NULL; sip = sip->next) {
362         bsp = BioseqFind (sip);
363         sip = sip->next;
364       }
365       ddp = ddp->next;
366     }
367   } else {
368     AlnMgr2IndexSingleChildSeqAlign(salp);
369     num_rows = AlnMgr2GetNumRows(salp);
370     for (row = 1; row <= num_rows && bsp == NULL; row++) {
371       sip = AlnMgr2GetNthSeqIdPtr(salp, row);
372       bsp = BioseqFind(sip);
373     }
374   }
375   SeqEntrySetScope (oldscope);  
376   return bsp;
377 }
378 
379 
380 static void ValMessage (SeqAlignPtr salp, Int1 MessageCode, ErrSev errlevel, SeqIdPtr id, SeqIdPtr idcontext , Int4 Intvalue) 
381 {
382   
383   Char     buf[256], 
384            buf3[64],
385            string1[64],
386            string2[552];
387   GatherContextPtr gcp;
388   Int4     pos;
389 
390   string1[0] = '\0';
391   string2[0] = '\0';
392   SeqIdWrite(id, buf, PRINTID_FASTA_LONG, sizeof(buf)-1);
393   switch(MessageCode)
394   {
395     case Err_SeqId:
396       sprintf(string1, "SeqId");
397       sprintf(string2, "The sequence corresponding to SeqId %s could not be found", buf);
398       break;
399 
400     case Err_Strand_Rev:
401       pos = valmsggetseqpos(salp, Intvalue, id);
402       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
403       sprintf(string1, "Strand");
404       sprintf(string2, "The strand labels for SeqId %s are inconsistent across the alignment; the first inconsistent region is the %ld(th) region, near sequence position %ld, context %s", buf, (long) Intvalue, (long) pos, buf3);
405       break;
406 
407     case Err_Denseg_Len_Start:
408       pos = valmsggetseqpos(salp, Intvalue, id);
409       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
410       sprintf(string1, "Start/Length");
411       sprintf(string2, "There is a problem with sequence %s, in segment %ld (near sequence position %ld), context %s: the segment is too long or short or the next segment has an incorrect start position", buf, (long) Intvalue, (long) pos, buf3);
412       break;
413 
414     case  Err_Start_Less_Than_Zero:
415       pos = valmsggetseqpos(salp, Intvalue, id);
416       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
417       sprintf(string1, "Start");
418       sprintf(string2, "Start point is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s", (long) Intvalue, (long) pos, buf, buf3);
419       break;
420 
421     case Err_Start_More_Than_Biolen:
422       pos = valmsggetseqpos(salp, Intvalue, id);
423       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
424       sprintf(string1, "Start");
425       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
426       break;
427 
428     case Err_End_Less_Than_Zero:
429       pos = valmsggetseqpos(salp, Intvalue, id);
430       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
431       sprintf(string1, "Length");
432       sprintf(string2, "End point is less than zero in segment %ld (near position %d) for sequence ID: %s in the context of %s.  This could be a formatting error", (long) Intvalue, (int) pos,buf, buf3);
433       break;
434 
435     case Err_End_More_Than_Biolen:
436       pos = valmsggetseqpos(salp, Intvalue, id);
437       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
438       sprintf(string1, "Length");
439       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
440       break;
441 
442     case Err_Len_Less_Than_Zero:
443       pos = valmsggetseqpos(salp, Intvalue, id);
444       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
445       sprintf(string1, "Length");
446       sprintf(string2, "Segment length is less than zero in segment %ld (near sequence position %ld) for sequence ID: %s in the context of %s.  Look for extra characters in this segment or flanking segments", (long) Intvalue, (long) pos, buf, buf3); 
447       break;
448 
449     case Err_Len_More_Than_Biolen:
450       pos = valmsggetseqpos(salp, Intvalue, id);
451       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
452       sprintf(string1, "Length");
453       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
454       break; 
455  
456     case Err_Sum_Len_Start:
457       pos = valmsggetseqpos(salp, Intvalue, id);
458       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
459       sprintf(string1, "Start");
460       sprintf(string2, "In sequence %s, segment %ld (near sequence position %ld) context %s, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment", buf, (long) Intvalue, (long) pos, buf3);
461       break;
462 
463     case Err_SeqAlign_DimSeqId_Not_Match:
464       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
465       sprintf(string1, "SeqId");
466       sprintf(string2, "The Seqalign has more or fewer ids than the number of rows in the alignment (context %s).  Look for possible formatting errors in the ids.", buf3);
467       break;
468 
469     case Err_Segs_DimSeqId_Not_Match:
470       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
471       sprintf(string1, "SeqId");
472       sprintf(string2, "In segment %ld, there are more or fewer rows than there are seqids (context %s).  Look for possible formatting errors in the ids.", (long) Intvalue, buf3);
473       break;
474 
475     case Err_Fastalike:
476       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
477       sprintf(string1, "Fasta");
478       sprintf(string2, "This may be a fasta-like alignment for SeqId: %s in the context of %s", buf, buf3); 
479       break;
480 
481     case Err_Null_Segs:
482       sprintf(string1, "Segs");
483       sprintf(string2, "This alignment is missing all segments.  This is a non-correctable error -- look for serious formatting problems.");
484       break;
485 
486     case Err_Segment_Gap:
487       pos = valmsggetseqpos(salp, Intvalue, id);
488       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
489       sprintf(string1, "Segs");
490       sprintf(string2, "Segment %ld (near alignment position %ld) in the context of %s contains only gaps.  Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them.", (long) Intvalue + 1, (long) pos, buf3);
491       break;
492 
493     case Err_Segs_Dim_One:
494       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
495       sprintf(string1, "Segs");
496       sprintf(string2, "Segment %ld apparently has only one sequence.  Each portion of the alignment must have at least two sequences.  context %s", (long) Intvalue, buf3);
497       break;
498 
499     case Err_SeqAlign_Dim_One:
500       SeqIdWrite (idcontext, buf3, PRINTID_REPORT, sizeof (buf3));
501       sprintf(string1, "Dim");
502       sprintf(string2, "This seqalign apparently has only one sequence.  Each alignment must have at least two sequences.  context %s", buf3);
503       break;
504 
505     case Err_Segtype :
506       sprintf(string1, "Segs");
507       sprintf(string2, "This alignment has a undefined or unsupported Seqalign segtype %ld", (long) Intvalue);
508       break;
509       
510     case Err_Pcnt_ID :
511       sprintf(string1, "PercentIdentity");
512       sprintf(string2, "This alignment has a percent identity of %d%%", Intvalue);
513       break;
514 
515     case Err_Short_Aln:
516       sprintf(string1, "ShortAln");
517       sprintf(string2, "This alignment is shorter than at least one non-farpointer sequence.");
518       break;
519 
520     case Err_Unexpected_Alignment_Type:
521       sprintf(string1, "UnexpectedAlignmentType");
522       sprintf (string2, "This is not a DenseSeg alignment.");
523       break;
524 
525     default:
526       break;
527   }
528   if (useValErr) {
529     if (salp != NULL && useVsp != NULL) {
530       gcp = useVsp->gcp;
531       if (gcp != NULL) {
532           gcp->entityID = salp->idx.entityID;
533           gcp->itemID = salp->idx.itemID;
534           gcp->thistype = salp->idx.itemtype;
535 
536         useVsp->bsp = BioseqForAlignment(salp);
537         ValidErr (useVsp, errlevel, 6, MessageCode, "%s: %s", string1, string2);
538       }
539     }
540     return;
541   }
542   if (StringLen(string1) > 0)
543      errorp = JYConstructErrorMessage (string1, string2, errlevel, &errorp);
544 }
545 
546  
547 /******************************************************************
548 return the number of seqid
549 ******************************************************************/ 
550 static Int2 CountSeqIdInSip (SeqIdPtr sip)
551 {
552     Int2 numids=0;
553 
554      while(sip) 
555        { 
556      numids++;
557      sip=sip->next;
558        }
559      return numids;
560 }
561 
562 /*********************************************************/
563 static void delete_bioseqs (ValNodePtr ids, Uint2 entityID)
564 {
565   SeqEntryPtr  sep_top;
566   SeqEntryPtr  sep_del;
567   ValNodePtr   vnp;
568   SeqIdPtr     sip;
569   SeqLocPtr    slp;
570   BioseqPtr    bsp;
571   ObjMgrDataPtr  omdptop;
572   ObjMgrData     omdata;
573   Uint2          parenttype;
574   Pointer        parentptr;
575 
576   if (ids == NULL)
577      return;
578   sep_top = GetTopSeqEntryForEntityID (entityID);
579   SaveSeqEntryObjMgrData (sep_top, &omdptop, &omdata);
580   GetSeqEntryParent (sep_top, &parentptr, &parenttype);
581 
582   vnp=ids;
583   while (vnp!=NULL)
584   {
585      sip = (SeqIdPtr) vnp->data.ptrvalue;
586      if (sip!=NULL) {
587         slp = (SeqLocPtr)ValNodeNew (NULL);
588         slp->choice = SEQLOC_WHOLE;
589         slp->data.ptrvalue = sip;
590         bsp = GetBioseqGivenSeqLoc (slp, entityID);
591         if (bsp!=NULL) {
592            sep_del=GetBestTopParentForData (entityID, bsp);
593            RemoveSeqEntryFromSeqEntry (sep_top, sep_del, FALSE);
594         }
595         slp->data.ptrvalue = NULL;
596         SeqLocFree (slp);
597      }
598      vnp=vnp->next;
599   }
600   SeqMgrLinkSeqEntry (sep_top, parenttype, parentptr);
601   RestoreSeqEntryObjMgrData (sep_top, omdptop, &omdata);
602   RenormalizeNucProtSets (sep_top, TRUE);
603 
604   for (vnp=ids; vnp!=NULL; vnp=vnp->next) {
605      SeqIdFree ((SeqIdPtr) vnp->data.ptrvalue);
606      vnp->data.ptrvalue = NULL;
607   }
608   ValNodeFree (vnp);
609   return;
610 }
611 
612 
613 /******************************************************************
614 validate a SeqId
615 ******************************************************************/ 
616 static void ValidateSeqId (SeqIdPtr sip, SeqAlignPtr salp)
617 {
618   SeqIdPtr  siptemp=NULL, sipnext;
619   BioseqPtr bsp=NULL;
620   
621   for(siptemp=sip; siptemp!=NULL; siptemp=siptemp->next)
622   {
623     /*
624     bsp = AlignValBioseqLockById(siptemp);
625     if(!bsp)
626         ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
627     else
628         AlignValBioseqUnlockById(siptemp);
629     */
630     sipnext = siptemp->next;
631     siptemp->next = NULL;
632     bsp = BioseqFindCore (siptemp);
633     if (bsp == NULL && siptemp->choice == SEQID_LOCAL) {
634         ValMessage (salp, Err_SeqId, SEV_ERROR, siptemp, NULL, 0);
635     }
636     siptemp->next = sipnext;
637   }
638   return;
639 }
640 
641 /******************************************************************
642 return seqid for each seg.  
643 Note that a newly created seqid chain is returned for stdseg 
644 and you need to free the memory after you use it in this case
645 ******************************************************************/ 
646 static SeqIdPtr SeqIdInAlignSegs(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
647 {
648 
649   SeqIdPtr sip=NULL;
650   StdSegPtr ssp;
651   DenseDiagPtr ddp;
652   DenseSegPtr dsp;
653   PackSegPtr psp;
654   SeqLocPtr slp=NULL, slptemp;
655 
656   if(!segs)
657   {
658       ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
659       return NULL;
660   }
661   if(segtype==1) 
662   { /* DenseDiag */
663       
664       ddp=(DenseDiagPtr)segs;    
665       sip=ddp->id;
666   }
667   else if (segtype==2)
668   { /* DenseSeg */
669       
670       dsp = (DenseSegPtr) segs;
671       sip=dsp->ids;
672   }
673   else if (segtype==3)
674   { /* StdSeg */
675       
676       ssp = (StdSegPtr)segs;
677       slp = ssp->loc;
678       /*make a new linked list of SeqId*/
679       for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
680         AddSeqId(&sip, SeqLocId(slptemp));
681       
682   }
683   else if(segtype==4)
684   { /* Packed Seg. Optimal for editing alignments */
685       
686       psp = (PackSegPtr)segs;
687       if (psp!=NULL)
688         sip = psp->ids;
689   }      
690   return sip;
691 }
692 
693  
694 /******************************************************************
695 validate SeqId in sequence alignment
696 ******************************************************************/ 
697 static void  ValidateSeqIdInSeqAlign (SeqAlignPtr salp)
698 {
699   SeqIdPtr sip=NULL;
700   Pointer segptr=NULL;
701   DenseDiagPtr ddp=NULL, ddptemp;
702   StdSegPtr    ssp=NULL, ssptemp;
703  
704 
705   if(salp)
706     {     
707       segptr=salp->segs;
708       if(!segptr)
709     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
710       else
711     {
712 
713       /*densediag */
714       if(salp->segtype==1)
715         {
716           /*cast to appropriate pointer*/
717           ddp=(DenseDiagPtr)segptr;
718           for(ddptemp=ddp; ddptemp!=NULL; ddptemp=ddptemp->next)
719         {
720           
721           sip=SeqIdInAlignSegs((Pointer)ddptemp, salp->segtype, salp);    
722           ValidateSeqId(sip, salp);
723         }
724         }
725       
726       /*Stdseg*/
727       else if(salp->segtype==3)
728         {
729           /*cast to appropriate pointer*/
730           ssp=(StdSegPtr)segptr;
731           for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
732         {
733           
734           sip=SeqIdInAlignSegs((Pointer)ssptemp, salp->segtype, salp);    
735           ValidateSeqId(sip, salp);
736           /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
737           SeqIdSetFree(sip);
738         }
739         }
740       
741       /*Denseseg, Packseg*/
742       else if(salp->segtype==2||salp->segtype==4)
743         {
744           
745           sip=SeqIdInAlignSegs(segptr, salp->segtype, salp);    
746           ValidateSeqId(sip, salp);
747         } 
748     }
749     }
750 }
751 
752 /******************************************************************
753 return true if  two sip are the same, false otherwise.  
754 Also return false if there is error in sip
755 ******************************************************************/ 
756 static Boolean SeqIdCmp (SeqIdPtr sip1, SeqIdPtr sip2)
757 {
758   Char buf1[256], buf2[256];
759  
760   if(!sip1||!sip2)
761     return FALSE;
762 
763   SeqIdWrite(sip1, buf1, PRINTID_FASTA_LONG, 255);
764   SeqIdWrite(sip2, buf2, PRINTID_FASTA_LONG, 255);
765   return(!StringCmp(buf1, buf2));
766  
767 }
768  
769 
770 /******************************************************************
771 return the strand for a seqloc with seqid=sip in a stdseg.  
772 Note, it returns 255 if null sip or ssp
773 ******************************************************************/ 
774 static Uint1 SeqLocStrandForSipInStdSeg (SeqIdPtr sip, StdSegPtr ssp, SeqAlignPtr salp)
775 {
776   SeqLocPtr slp, slptemp;
777   Uint1     strand=0;
778     
779   if(!sip||!ssp)
780     return (255);
781 
782   slp=ssp->loc;
783   for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
784   {
785       if(SeqIdCmp(sip, SeqLocId(slptemp)))
786     {
787       strand=SeqLocStrand(slptemp);
788       break;
789     }
790   }
791   return strand;
792 }
793 
794 
795 /******************************************************************
796 check if the  strand is consistent in Stdseg
797 ******************************************************************/ 
798 static void ValidateStrandInStdSeg(StdSegPtr ssp, SeqAlignPtr salp)
799 {
800   SeqIdPtr     sip=NULL,  sip_inseg=NULL;
801   Uint1           strand1=0, strand2=0;
802   StdSegPtr    ssptemp, ssptemp2, ssptemp3;
803   SeqLocPtr    slp, slptemp;
804   ValNodePtr   FinishedSip=NULL, temp;
805   Boolean      CheckedStatus;
806   Int4         start_numseg=0, end_numseg=0;
807   
808   if(!ssp)
809     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
810   else
811     for(ssptemp=ssp; ssptemp!=NULL; ssptemp=ssptemp->next)
812       {
813     sip_inseg=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
814     start_numseg++;
815     slp=ssptemp->loc;
816     for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
817       {
818         
819         CheckedStatus=FALSE;
820         sip=SeqLocId(slptemp);
821         if(sip)
822           {
823         /*if a seqloc represented by a sip has been checked, set the checkedstatus flag to true so it will not be checked again*/
824         for(temp=FinishedSip; temp!=NULL; temp=temp->next)
825           {
826             if(SeqIdCmp(sip, temp->data.ptrvalue))
827               {
828             CheckedStatus=TRUE;
829             break;
830               }
831           }
832         /*seqloc not checked yet*/
833         if(!CheckedStatus)
834           {
835             
836             /*keep a record of  checked sip*/
837             ValNodeAddPointer(&FinishedSip, 0, sip);
838             end_numseg=start_numseg;
839             /*go through all segs to get at least two strand, if any, for this seqloc*/
840             for(ssptemp2=ssptemp; ssptemp2!=NULL; ssptemp2=ssptemp2->next, end_numseg++)
841               {
842             /*get the first defined strand */
843             strand1=SeqLocStrandForSipInStdSeg(sip, ssptemp2, salp);
844             
845             if(strand1!=0&&strand1!=255)
846               {
847                 ssptemp2=ssptemp2->next;
848                 break;
849               }
850             
851               }
852             
853             if(strand1!=0&&strand1!=255)
854               /*continue to get next strand */
855               for(ssptemp3=ssptemp2; ssptemp3!=NULL; ssptemp3=ssptemp3->next, end_numseg++)
856             {
857               strand2=SeqLocStrandForSipInStdSeg(sip, ssptemp3, salp);
858               if(strand2==0||strand2==255)
859                 continue;
860               
861               if(strand2!=0&&strand2!=255)
862                 /*strand should be same for a given seq*/ 
863                 if(strand1!=strand2)
864                   
865                   ValMessage (salp, Err_Strand_Rev, SEV_ERROR, sip, sip_inseg, end_numseg+1);
866             }
867             }
868           }
869       }
870     SeqIdSetFree(sip_inseg);
871     
872       }
873   
874   ValNodeFree(FinishedSip);
875 }
876  
877  
878 /******************************************************************
879 check if the  strand is consistent in Denseseg
880 ******************************************************************/ 
881 static void ValidateStrandInPack_DenseSeg(Pointer segs, Uint1 segtype, SeqAlignPtr salp)
882 { 
883   DenseSegPtr dsp=NULL;
884   PackSegPtr psp=NULL;
885   Int4         numseg, aligndim, dimnumseg, i, j, m;
886   SeqIdPtr     sip=NULL, siptemp;
887   Uint1           strand1=0, strand2=0;
888   Uint1Ptr strandptr=NULL;
889         
890   if(!segs)
891   {
892     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
893   } 
894   else if(segtype==2||segtype==4)
895   {
896     if(segtype==2)
897     {
898       dsp=(DenseSegPtr)segs;
899       strandptr=dsp->strands;
900       sip=dsp->ids;
901       numseg=dsp->numseg;
902       aligndim=dsp->dim;
903     }     
904     else if(segtype==4)
905     {
906       psp=(PackSegPtr)segs;
907       strandptr=psp->strands;
908       sip=psp->ids;
909       numseg=psp->numseg;
910       aligndim=psp->dim;
911     }
912 
913     dimnumseg=numseg*aligndim;
914     if(strandptr)
915     {     
916       /*go through id for each alignment sequence*/
917       for(j=0; j<aligndim; j++)
918       {
919         /* first  strand value for each sequence*/ 
920         strand1=strandptr[j];
921         /* go through all strand values for each sequence*/  
922         for(i=j+aligndim; i<dimnumseg; i=i+aligndim)
923         {          
924           strand2=strandptr[i];
925           
926           if(strand1==0||strand1==255)
927           {
928             strand1=strand2;
929             continue;
930           }
931           
932           /*skip undefined strand*/
933           if(strand2!=0&&strand2!=255) 
934           {
935             /*strand should be same for a given seq*/ 
936             if(strand1!=strand2)
937             {
938               /*find current seqid*/
939             
940               siptemp=sip;
941               for(m=0; m<j&&siptemp!=NULL; m++)
942               {
943                 siptemp=siptemp->next;
944               }
945               ValMessage (salp, Err_Strand_Rev, SEV_ERROR, siptemp, sip, i/aligndim+1);
946             }
947           }
948         }
949       }
950     }
951   }
952 }
953 
954 
955 
956 
957 /******************************************************************
958 check if the  strand is consistent in SeqAlignment of global 
959 or partial type
960 ******************************************************************/ 
961 static void ValidateStrandinSeqAlign(SeqAlignPtr salp)
962 {
963   StdSegPtr ssp=NULL ;
964   
965   if(salp)
966     {
967    
968       /*Strands needs to be validated  in case of global or partial alignment*/ 
969      
970       /*denseseg or packseg*/
971       if(salp->segtype==2||salp->segtype==4)
972  
973     ValidateStrandInPack_DenseSeg(salp->segs, salp->segtype, salp);
974 
975       /*stdseg*/
976       else if(salp->segtype==3)
977     {
978       ssp=(StdSegPtr)salp->segs;
979       ValidateStrandInStdSeg(ssp, salp);
980     }
981    } 
982 }
983 
984 
985 
986 /******************************************************************
987 Make sure that, in Densediag alignment, segment length and 
988 start point is not less than zero, and  segment length is not greater 
989 than Bioseq length
990 ******************************************************************/ 
991 static void ValidateSeqlengthInDenseDiag (DenseDiagPtr ddp, SeqAlignPtr salp)
992 {
993   Int4Ptr      stptr=NULL; 
994   DenseDiagPtr ddptemp;
995   Int2         numseg, i;
996   SeqIdPtr     sip=NULL, siptemp;
997   Int4         bslen;
998   BioseqPtr    bsp=NULL;
999   
1000 
1001   if(!ddp)
1002     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1003   else
1004     {
1005       for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1006     {
1007       sip=ddp->id;
1008       stptr=ddptemp->starts;
1009       
1010       if(stptr)
1011         {
1012           for(i=0, siptemp=sip; i<ddptemp->dim; i++, siptemp=siptemp->next)
1013         {
1014           bsp=AlignValBioseqLockById(siptemp);
1015           if(bsp)
1016             {
1017               bslen=bsp->length; 
1018               AlignValBioseqUnlock (bsp);
1019               /*verify start*/
1020               if(stptr[i]<0)
1021             ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg);     
1022               if(stptr[i]>=bslen)
1023             ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg); 
1024               
1025               /*verify length*/
1026               
1027               if(ddptemp->len<0)
1028             ValMessage (salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg); 
1029               
1030               if(ddptemp->len+stptr[i]>bslen)
1031             ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg);  
1032             }
1033         }
1034         }
1035     }
1036     }
1037 }
1038 
1039 
1040 /******************************************************************
1041 return a new copy of len array in reversed order 
1042 ******************************************************************/ 
1043 static Int4Ptr GetReverseLength (Int2 numseg, Int4Ptr lenptr)
1044 {
1045   Int4Ptr lenptrtemp=NULL;
1046   Int2 p;
1047   
1048   if(!lenptr)
1049     return NULL;
1050 
1051   lenptrtemp=(Int4Ptr)MemNew(numseg*sizeof(Int4Ptr));
1052   if(!lenptrtemp)
1053   {
1054       ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1055       return NULL;
1056   }
1057   for(p=0; p<numseg; p++)    
1058     lenptrtemp[p]=lenptr[numseg-1-p];
1059   return lenptrtemp;
1060 
1061 }
1062 
1063 /******************************************************************
1064 return a new copy of start array in reversed "numseg" order .  
1065 Note that the relative position of starts in each numseg has not changed.  
1066 Example:  original length={0, 0, 10, -1, 30, 10}, numseg=3, 
1067 lens={10, 20, 40}, the reversed length={30, 10, 10, -1, 0, 0}
1068 ******************************************************************/ 
1069 static Int4Ptr GetReverseStart(Int2 numseg, Int2 dim, Int4Ptr stptr)
1070 {
1071   Int4Ptr stptrtemp=NULL;
1072   Int2 p, q;
1073 
1074   if(!stptr)
1075     return NULL;
1076 
1077   stptrtemp=(Int4Ptr)MemNew(numseg*dim*sizeof(Int4Ptr));
1078   if(!stptrtemp)
1079   {
1080       ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory"); 
1081       return NULL; 
1082   }
1083   for(p=0; p<numseg; p++)
1084     for(q=0; q<dim; q++)
1085       stptrtemp[q+p*dim]=stptr[q+(numseg-1-p)*dim];
1086 
1087   return stptrtemp;
1088 }
1089 
1090  
1091 
1092 /******************************************************************
1093 Make sure that, in Denseseg alignment, segment length and 
1094 start point agrees each other and the sum of segment length 
1095 is not greater than Bioseq length
1096 ******************************************************************/ 
1097 static void ValidateSeqlengthInDenseSeg (DenseSegPtr dsp, SeqAlignPtr salp)
1098 {
1099 
1100   Int4Ptr      lenptr=NULL, stptr=NULL, lenptrtemp=NULL, stptrtemp=NULL, lenptrtemp2=NULL, stptrtemp2=NULL;
1101   
1102   Int2         numseg, aligndim, i, j;
1103   SeqIdPtr     sip=NULL, siptemp;
1104   Int4         bslen = 0;
1105   BioseqPtr    bsp=NULL;
1106 
1107  if(!dsp)
1108    ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1109  else
1110     {
1111       numseg=dsp->numseg;
1112       aligndim=dsp->dim;     
1113       
1114       stptr=dsp->starts;
1115       lenptr=dsp->lens;
1116       sip=dsp->ids;
1117      
1118       if(stptr==NULL||lenptr==NULL)
1119     return;
1120       
1121   
1122       /*go through each sequence*/
1123       for(j=0, siptemp=sip; j<aligndim&&siptemp; j++, siptemp=siptemp->next)
1124     {
1125        
1126       lenptrtemp=lenptr;
1127       stptrtemp=stptr;
1128       /*if on minus strand, use reversed length and start array*/
1129       if(dsp->strands)
1130         {
1131           if(dsp->strands[j]==Seq_strand_minus)
1132         {
1133           if(!lenptrtemp2&&!stptrtemp2)
1134             {
1135               lenptrtemp2= GetReverseLength (numseg, lenptr);
1136               if (lenptrtemp2==NULL)
1137                  return;
1138               stptrtemp2= GetReverseStart (numseg, aligndim, stptr);
1139               if (stptrtemp2==NULL)
1140                  return;
1141             }
1142           lenptrtemp=lenptrtemp2;
1143           stptrtemp=stptrtemp2;
1144         }
1145         }
1146 
1147       bsp=AlignValBioseqLockById(siptemp);
1148       if(bsp!=NULL)
1149         {
1150           bslen=bsp->length;  
1151           AlignValBioseqUnlock (bsp);
1152         }
1153 
1154       /*go through each segment for a given sequence*/
1155       for(i=0; i<numseg; i++)
1156         {
1157        
1158           /*no need to verify if segment is not present*/
1159           if(stptrtemp[j+i*aligndim]!=-1)
1160         {
1161  
1162           /*length plus start should be equal to next start*/
1163           /*check a start if it's not the last one and the next start is not -1*/
1164           if(i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]!=-1)
1165             {      
1166               
1167               if(stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+(i+1)*aligndim]) 
1168             {
1169               if (dsp->strands)
1170                 {
1171                   if(dsp->strands[j]==2)
1172                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i); 
1173                   else
1174                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);    
1175                 }
1176                           else
1177                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);
1178             }
1179             }
1180           /*check a start if it's not the last one and the next start is -1*/
1181           else if (i!=numseg-1&&stptrtemp[j+(i+1)*aligndim]==-1)
1182             {
1183               Int4 k=i+1;
1184               /*find the next start that is not last and not -1*/
1185               while(k<numseg&&stptrtemp[j+k*aligndim]==-1)
1186             k++;
1187 
1188               /*length plus start should be equal to the closest next start that is not -1*/           
1189      
1190               if(k<numseg&&stptrtemp[j+i*aligndim]+lenptrtemp[i]!=stptrtemp[j+k*aligndim])
1191             {
1192               if (dsp->strands)
1193                 {
1194                   if(dsp->strands[j]==2)
1195                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , numseg-i); 
1196                   else
1197                  ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1); 
1198                 }
1199                           else
1200                 ValMessage (salp, Err_Denseg_Len_Start, SEV_ERROR, siptemp, sip , i+1);    
1201             }
1202             }
1203           
1204           
1205          /*make sure the start plus segment does not exceed total bioseq length*/ 
1206           if(bsp!=NULL)
1207             {
1208               
1209               if(stptrtemp[j+i*aligndim]+lenptrtemp[i]>bslen)
1210             if (dsp->strands)
1211               {
1212                 if(dsp->strands[j]==2)
1213                   ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , numseg-1); 
1214                 else
1215                   ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1); 
1216               }
1217             else
1218               ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip , i+1); 
1219             }
1220           
1221         }
1222                     
1223         }        
1224     }
1225     }        
1226 
1227 
1228  MemFree(lenptrtemp2);
1229  MemFree(stptrtemp2);
1230                   
1231 
1232 }
1233 
1234 /******************************************************************
1235 Make sure that, in Seqloc of a Stdseg alignment, 
1236 end point, start point and length are not less than zero, 
1237 and are not greater than Bioseq length
1238 ******************************************************************/ 
1239 static void ValidateSeqlengthInStdSeg (StdSegPtr ssp, SeqAlignPtr salp)
1240 { 
1241   StdSegPtr    ssptemp;
1242   Int2         numseg;
1243   SeqIdPtr     sip=NULL, siptemp;
1244   Int4         start, end, length, bslen;
1245   BioseqPtr    bsp=NULL;
1246   SeqLocPtr    slp=NULL, slptemp;
1247 
1248   if(!ssp) {
1249     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1250   } else {
1251     for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++) { 
1252       /*get all seqid in current segment*/
1253       sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);         
1254       slp=ssptemp->loc;
1255       if(slp==NULL)
1256         return;
1257       for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next) { 
1258         siptemp=SeqLocId(slptemp);
1259         start=SeqLocStart(slptemp);
1260         end=SeqLocStop(slptemp);
1261         length=SeqLocLen(slptemp);
1262         
1263         bsp=AlignValBioseqLockById(siptemp);
1264         if(bsp) {
1265           bslen=bsp->length;
1266           AlignValBioseqUnlock (bsp);
1267      
1268           /*verify start*/
1269           if(start<0) {
1270             ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1);      
1271           }
1272             
1273           if(start>bslen-1) {
1274             ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1); 
1275           }
1276           
1277             /*verify end*/
1278           if(end<0) {
1279             ValMessage (salp, Err_End_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1); 
1280           }
1281           if(end>bslen-1) {
1282             ValMessage (salp, Err_End_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1); 
1283           }
1284                                   
1285           /*verify length*/
1286           if(length<0) {
1287             ValMessage (salp, Err_Len_Less_Than_Zero, SEV_ERROR, siptemp, sip , numseg+1); 
1288           }
1289             
1290           if(length>bslen) {
1291             ValMessage (salp, Err_Len_More_Than_Biolen, SEV_ERROR, siptemp, sip , numseg+1);  
1292           }
1293         
1294         }
1295       }
1296       /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/      
1297       SeqIdSetFree(sip);
1298     }
1299   }
1300 }
1301 
1302 /******************************************************************
1303 validate the start and segment length in packseg
1304 ******************************************************************/ 
1305 static void ValidateSeqlengthInPackSeg (PackSegPtr psp, SeqAlignPtr salp)
1306 {
1307   Uint1Ptr     seqpresence=NULL;
1308   Int2         numseg, aligndim, i, j; 
1309   SeqIdPtr     sip=NULL, siptemp;
1310   Int4Ptr      stptr=NULL, lenptr=NULL; 
1311   BioseqPtr    bsp=NULL;
1312   Int4         bslen, seg_start;
1313 
1314   if(!psp)
1315     ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1316   else
1317     {
1318       numseg=psp->numseg;
1319       aligndim=psp->dim;           
1320       sip=psp->ids;
1321       stptr=psp->starts;
1322       lenptr=psp->lens;
1323     
1324       if(stptr&&lenptr)
1325     {
1326       if(psp->present)
1327         {
1328           BSSeek(psp->present, 0, SEEK_SET);
1329           seqpresence=MemNew(BSLen(psp->present));
1330           if(!seqpresence)
1331         {
1332           
1333           ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1334           return;
1335           
1336         }
1337           BSRead(psp->present, seqpresence, BSLen(psp->present));
1338           /*go through each sequence*/
1339           for(j=0, siptemp=sip; j<aligndim && siptemp != NULL; siptemp=siptemp->next, j++)
1340         {  
1341           bsp=AlignValBioseqLockById(siptemp);
1342           if(bsp)
1343             {
1344               bslen=bsp->length; 
1345               AlignValBioseqUnlock (bsp);
1346               seg_start=stptr[j];
1347               /*check start*/
1348               if(seg_start<0)
1349             ValMessage (salp, Err_Start_Less_Than_Zero, SEV_ERROR, siptemp, sip , 0);     
1350               if(seg_start>=bslen)
1351             ValMessage (salp, Err_Start_More_Than_Biolen, SEV_ERROR, siptemp, sip , 0);
1352               
1353               /*go through each segment*/
1354               for(i=0; i<numseg; i++)
1355             {
1356               /*if this segment is present*/
1357               if(seqpresence[(i*aligndim+j)/8]&jybitnum[(i*aligndim+j)%8])      
1358                 {
1359                   /*check start plus seg length*/
1360                   seg_start=seg_start+lenptr[i];
1361                   if(seg_start>bslen)
1362                  ValMessage (salp, Err_Sum_Len_Start, SEV_ERROR, siptemp, sip, numseg);
1363                 }
1364             }
1365             }
1366         }
1367         }
1368     }
1369     }
1370   MemFree(seqpresence);         
1371 }
1372 
1373 /******************************************************************
1374 check segment length, start and end point in Denseseg, Densediag and Stdseg
1375 ******************************************************************/ 
1376 static void  ValidateSeqlengthinSeqAlign (SeqAlignPtr salp)
1377 {
1378    
1379   if (salp)
1380   { 
1381       if(salp->segtype==1)
1382     ValidateSeqlengthInDenseDiag ((DenseDiagPtr)salp->segs, salp);
1383       else if(salp->segtype==2)
1384     ValidateSeqlengthInDenseSeg ((DenseSegPtr)salp->segs, salp);
1385       else if(salp->segtype==3)
1386     ValidateSeqlengthInStdSeg ((StdSegPtr)salp->segs, salp);
1387       else if(salp->segtype==4)
1388     ValidateSeqlengthInPackSeg ((PackSegPtr)salp->segs, salp);
1389   }
1390 }
1391 
1392 /******************************************************************
1393 check if # of seqid matches the dimensions, and 
1394 if there is only one seqeuence in seqalign
1395 ******************************************************************/ 
1396 static void ValidateDimSeqIds (SeqAlignPtr salp)
1397 {
1398   SeqIdPtr sip=NULL;
1399   DenseDiagPtr ddp=NULL, ddptemp;
1400   StdSegPtr ssp=NULL, ssptemp;
1401   DenseSegPtr dsp=NULL;
1402   Int4 numseg=0;
1403   
1404  if(salp)
1405    {
1406      /*densediag */
1407      if(salp->segtype==1)
1408        {
1409      
1410      ddp=(DenseDiagPtr)salp->segs;
1411      if(!ddp)
1412        ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1413      else
1414        for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1415          {
1416            sip=ddptemp->id;
1417            if(ddptemp->dim==1)
1418          ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1419            if(ddptemp->dim!=CountSeqIdInSip(sip))          
1420          ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1421           
1422          }
1423        }
1424      
1425      /*denseseg, packseg */
1426      else if(salp->segtype==2||salp->segtype==4)
1427        {
1428      dsp=(DenseSegPtr) (salp->segs);
1429      if(!dsp)
1430        ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1431      else
1432        {
1433          sip=dsp->ids;
1434          if(dsp->dim==1)
1435            ValMessage (salp, Err_SeqAlign_Dim_One, SEV_ERROR, NULL, sip , 0); 
1436          if(dsp->dim!=CountSeqIdInSip(sip)) 
1437             ValMessage (salp, Err_SeqAlign_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , 0); 
1438            
1439        }
1440        }
1441      
1442      /*stdseg */
1443      else if(salp->segtype==3)
1444        {
1445      
1446      ssp=(StdSegPtr)salp->segs;
1447      if(!ssp)
1448        ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1449      else
1450        for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1451          {
1452            
1453            sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1454            if(ssptemp->dim==1)
1455          ValMessage (salp, Err_Segs_Dim_One, SEV_ERROR, NULL, sip , numseg+1);
1456            if(ssptemp->dim!=CountSeqIdInSip( sip)) 
1457          ValMessage (salp, Err_Segs_DimSeqId_Not_Match, SEV_ERROR, NULL, sip , numseg+1);
1458            /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1459            
1460            SeqIdSetFree(sip);
1461          }
1462        }
1463    }
1464 }
1465 
1466 /******************************************************************
1467 return true if a sip is contained in a seg, or false if otherwise 
1468 Note it returns FASLE for an empty seqloc.  
1469 It also returns false if error in sip or ssp
1470 ******************************************************************/ 
1471 static Boolean IsSipContainedInStdseg(SeqIdPtr sip, StdSegPtr ssp)
1472 {
1473   SeqLocPtr slp, slptemp;
1474   
1475   if(!sip||!ssp)
1476     return FALSE;
1477 
1478   slp=ssp->loc;
1479   for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1480     {
1481       if(slptemp->choice!=SEQLOC_EMPTY&&SeqIdCmp(sip, SeqLocId(slptemp)))
1482     return TRUE;
1483     }
1484   
1485   return FALSE;
1486 }
1487 
1488 static Int4 PercentStringMatch (CharPtr string1, CharPtr string2)
1489 {
1490   Int4 len1, len2, min_len, k, max_len;
1491   Int4 num_match = 0;
1492   
1493   if (StringHasNoText (string1) || StringHasNoText (string2))
1494   {
1495       return 0;
1496   }
1497   len1 = StringLen (string1);
1498   len2 = StringLen (string2);
1499   
1500   if (len1 > len2)
1501   {
1502       min_len = len2;
1503       max_len = len1;
1504   }
1505   else
1506   {
1507       min_len = len1;
1508       max_len = len2;
1509   }
1510   
1511   for (k = 0; k < min_len; k++)
1512   {
1513       if (string1[k] == string2[k] || string1[k] == 'N' || string2[k] == 'N')
1514       {
1515         num_match++;
1516       }
1517   }
1518   return (100 * num_match) / min_len;
1519 }
1520 
1521 static Boolean CheckForPercentMatch (SeqIdPtr sip_list)
1522 {
1523   SeqIdPtr  sip_temp, sip_next;
1524   BioseqPtr bsp;
1525   CharPtr   master_seq = NULL, this_seq = NULL;  
1526   
1527   if (sip_list == NULL) return FALSE;
1528   sip_next = sip_list->next;
1529   sip_list->next = NULL;
1530   bsp = BioseqFind (sip_list);
1531   if (bsp != NULL)
1532   {
1533     master_seq = GetSequenceByBsp (bsp);      
1534   }
1535   sip_list->next = sip_next;
1536   sip_temp = sip_next;
1537   if (bsp == NULL || master_seq == NULL) 
1538   {
1539       return FALSE;
1540   }
1541   
1542   for (sip_temp = sip_next; sip_temp != NULL; sip_temp = sip_next)
1543   {
1544       sip_next = sip_temp->next;
1545       sip_temp->next = NULL;
1546       
1547       bsp = BioseqFind (sip_temp);
1548       if (bsp != NULL)
1549       {
1550         this_seq = GetSequenceByBsp (bsp);
1551       } else {
1552         this_seq = NULL;
1553       }
1554       
1555       sip_temp->next = sip_next;
1556       if (bsp == NULL || StringHasNoText (this_seq) || PercentStringMatch (master_seq, this_seq) < 50)
1557       {
1558         MemFree (this_seq);
1559         return FALSE;
1560       }
1561       MemFree (this_seq);
1562   }
1563   return TRUE;
1564 }
1565 
1566 
1567 /******************************************************************
1568 check if an alignment is FASTA-like.  
1569 If all gaps are at the 3' ends with dimensions>2, it's FASTA-like
1570 ******************************************************************/ 
1571 static Boolean Is_Fasta_Seqalign (SeqAlignPtr salp)
1572 {
1573 
1574   SeqIdPtr    siptemp=NULL;
1575   DenseSegPtr dsp;
1576   Int4Ptr     startp;
1577   Boolean     gap;
1578   Int4        k;
1579   Int2        j;
1580   SeqIdPtr    bad_sip = NULL;
1581   
1582   /*check only global or partial type*/
1583   if(salp->type!=1&&salp->type!=3)
1584     return FALSE;
1585 
1586   if (salp->segtype != SAS_DENSEG) {
1587     ValMessage (salp, Err_Unexpected_Alignment_Type, SEV_ERROR, NULL, NULL, 0);
1588   } else {
1589     dsp = (DenseSegPtr) salp->segs;
1590     if(!dsp)
1591     {
1592       ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1593     }
1594     else
1595     {
1596       if(dsp->dim<=2)
1597       {
1598         return FALSE;
1599       }
1600       /* if any sequence has gaps at the 5' end or internal gaps, the entire
1601        * alignment is declared to be valid.
1602        * if the sequence contains no gaps at all or only 3' end gaps, check
1603        * sequences for matches against the first sequence - if more than half
1604        * of the nucleotides are matches, then call this not FASTA-like.
1605        */ 
1606       for (j=0, siptemp=dsp->ids; j<dsp->dim&&siptemp; j++, siptemp=siptemp->next)
1607       {
1608         gap=FALSE;
1609           
1610         for (k=0; k<dsp->numseg; k++)
1611         {
1612           startp=dsp->starts;
1613           
1614           /*if start value is -1, set gap flag to true*/
1615           if (startp[dsp->dim*k + j] < 0)
1616           {
1617             gap = TRUE;              
1618           }
1619           /*if a positive start value is found after the initial -1 start value, then it's not  fasta like, no need to check this sequence further */
1620           else if(gap)
1621           {
1622             if (bad_sip != NULL)
1623             {
1624               SeqIdFree (bad_sip);
1625             }
1626             return FALSE;              
1627           }
1628           /* if no positive start value is found after the initial -1 start value
1629            * (indicating that gaps exist only at the 5' end) or if no gaps
1630            * were found at all, flag this sequence as bad if it is the first found.
1631            */
1632           if(k==dsp->numseg-1)
1633           {
1634             if (bad_sip == NULL)
1635             {
1636               bad_sip = SeqIdDup (siptemp);
1637             }
1638           }
1639         }
1640       }
1641       if (bad_sip != NULL)
1642       {
1643         if (! CheckForPercentMatch (dsp->ids))
1644         {
1645           ValMessage (salp, Err_Fastalike, SEV_WARNING, bad_sip, dsp->ids, 0);
1646           SeqIdFree (bad_sip);
1647           return TRUE;        
1648         }
1649         SeqIdFree (bad_sip);
1650         return FALSE;
1651       }
1652     }
1653   }
1654   /*no fasta like sequence is found*/
1655   return FALSE;
1656   
1657 }  
1658   
1659  
1660 
1661 /******************************************************************
1662 check if there is a gap for all sequence in a segment
1663 ******************************************************************/ 
1664 static void Segment_Gap_In_SeqAlign(SeqAlignPtr salp)
1665 {
1666   Int4Ptr      stptr=NULL;
1667   DenseSegPtr  dsp=NULL;
1668   DenseDiagPtr ddp=NULL, ddptemp;
1669   StdSegPtr    ssp=NULL, ssptemp;
1670   PackSegPtr   psp=NULL;
1671   Uint1Ptr     seqpresence=NULL;
1672   Int2         numseg, aligndim, i, j; 
1673   SeqIdPtr     sip=NULL;
1674   SeqLocPtr    slp=NULL, slptemp;
1675   
1676 
1677   if(salp)
1678     {
1679       /*densediag*/
1680       if(salp->segtype==1)
1681     {
1682       ddp=(DenseDiagPtr)salp->segs;
1683       if(!ddp)
1684         ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1685       else
1686         {
1687           for(ddptemp=ddp, numseg=0; ddptemp!=NULL; ddptemp=ddptemp->next, numseg++)
1688         {
1689           sip=ddptemp->id;
1690           /*empty segment*/
1691           if(ddptemp->dim==0)   
1692             ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1693         }
1694         }
1695     }
1696  
1697    
1698       /*denseseg*/
1699      else if(salp->segtype==2)
1700     {
1701       dsp=(DenseSegPtr)salp->segs;
1702       if(!dsp)
1703         ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1704       else
1705         {
1706           numseg=dsp->numseg;
1707           aligndim=dsp->dim;           
1708           stptr=dsp->starts;
1709           sip=dsp->ids;
1710           
1711           if(stptr==NULL)
1712         return;
1713           
1714           /*go through each segment*/
1715           for(j=0; j<numseg; j++)
1716         {    
1717           /*go through each sequence */
1718           for(i=0; i<aligndim; i++)
1719             {
1720               
1721               if(stptr[j*aligndim+i]==-1)
1722             {  
1723               /*all starts are -1 in this segment*/
1724               if(i==aligndim-1)
1725                 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1726             }
1727               /*at least one start that is not -1*/
1728               else
1729             break;
1730               
1731             }
1732         }
1733         }
1734     }
1735 
1736         /*stdseg*/
1737      else if(salp->segtype==3)
1738     {
1739       ssp=(StdSegPtr)salp->segs;
1740       if(!ssp)
1741         ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1742       else
1743         {
1744           /*go through each segment*/
1745           for(ssptemp=ssp, numseg=0; ssptemp!=NULL; ssptemp=ssptemp->next, numseg++)
1746         {
1747           sip=SeqIdInAlignSegs((Pointer)ssptemp, 3, salp);
1748           slp=ssptemp->loc;
1749           /*go through each sequence*/
1750           for(slptemp=slp; slptemp!=NULL; slptemp=slptemp->next)
1751             { 
1752               if(slptemp->choice==SEQLOC_EMPTY||slptemp->choice==SEQLOC_NULL) 
1753             {
1754               if(slptemp->next)   
1755                 continue;
1756               /*all seqloc are empty*/ 
1757               else
1758                 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, numseg);
1759             }
1760               /*at least one non-empty seqloc*/
1761               else
1762             break;
1763             }
1764           /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1765           SeqIdSetFree(sip);
1766  
1767         }
1768         }
1769     }
1770       /*packseg*/
1771       else if(salp->segtype==4)
1772     {
1773       psp=(PackSegPtr)salp->segs;
1774       if(!psp)
1775         ValMessage (salp, Err_Null_Segs, SEV_ERROR, NULL, NULL, 0);
1776       else
1777         {
1778           numseg=psp->numseg;
1779           aligndim=psp->dim;           
1780           sip=psp->ids;
1781           if(psp->present)
1782         {
1783           BSSeek(psp->present, 0, SEEK_SET);
1784           seqpresence=MemNew(BSLen(psp->present));
1785           if(!seqpresence)
1786             {
1787               ErrPostEx (SEV_ERROR, 0,0,  "Warning:insufficient memory");
1788               return;
1789               
1790             }
1791           BSRead(psp->present, seqpresence, BSLen(psp->present));
1792           
1793           /*go through each segment*/
1794           for(j=0; j<numseg; j++)
1795             {    
1796               /*go through each sequence */
1797               for(i=0; i<aligndim; i++)
1798             {
1799               /*check the presence of each sequence by determining the bit value in a byte (0, not present; otherwise present)*/
1800               if(!(seqpresence[(j*aligndim+i)/8]&jybitnum[(j*aligndim+i)%8]))      
1801                 {
1802                   /*more sequence to go*/
1803                   if(i<aligndim-1)   
1804                 continue;
1805                   /*no sequence is present in this segment*/
1806                   else if(i==aligndim-1)
1807                 ValMessage (salp, Err_Segment_Gap, SEV_ERROR, NULL, sip, j);
1808                 }
1809               /*at least one sequence is present*/
1810               else
1811                 break;
1812             }
1813             }
1814         MemFree(seqpresence);
1815         }
1816         }
1817     
1818     }
1819 
1820 
1821     }
1822 }
1823 
1824 
1825 static Boolean IsAlignmentTPA (SeqAlignPtr salp)
1826 {
1827   Boolean isTPA = FALSE;
1828   BioseqPtr bsp;
1829   SeqIdPtr  sip = NULL, tmp_sip;
1830   SeqEntryPtr oldscope;
1831   DenseDiagPtr ddp;
1832   StdSegPtr    ssp;
1833 
1834   if (salp == NULL) {
1835     return FALSE;
1836   }
1837 
1838   oldscope = SeqEntrySetScope (NULL);
1839 
1840   switch (salp->segtype) {
1841     case SAS_DENDIAG:
1842       /*densediag */
1843       for (ddp = (DenseDiagPtr) salp->segs; ddp != NULL && !isTPA; ddp = ddp->next) {
1844         for (sip = SeqIdInAlignSegs((Pointer)ddp, salp->segtype, salp);
1845              sip != NULL && !isTPA;
1846              sip = sip->next) {
1847           bsp = BioseqLockById(sip);
1848           isTPA = HasTpaUserObject(bsp);
1849           BioseqUnlock(bsp);
1850         }
1851       }
1852       break;
1853     case SAS_STD: 
1854       /*Stdseg*/
1855       for (ssp = (StdSegPtr) salp->segs; ssp != NULL && !isTPA; ssp = ssp->next) {
1856         sip = SeqIdInAlignSegs((Pointer)ssp, salp->segtype, salp);
1857         for (tmp_sip = sip;
1858              tmp_sip != NULL && !isTPA;
1859              tmp_sip = tmp_sip->next) {
1860           bsp = BioseqLockById(tmp_sip);
1861           isTPA = HasTpaUserObject(bsp);
1862           BioseqUnlock(bsp);
1863         }
1864       }
1865       /*free Seqid if sip is a new chain created by SeqIdinAlignSegs*/
1866       SeqIdSetFree(sip);
1867       break;
1868     case SAS_DENSEG:
1869     case SAS_PACKED:
1870       /*Denseseg, Packseg*/
1871       for (sip=SeqIdInAlignSegs(salp->segs, salp->segtype, salp);
1872            sip != NULL && !isTPA;
1873            sip = sip->next) {
1874         bsp = BioseqLockById(sip);
1875         isTPA = HasTpaUserObject(bsp);
1876         BioseqUnlock(bsp);
1877       }
1878       break;
1879   }
1880 
1881   SeqEntrySetScope (oldscope);  
1882   return isTPA;
1883 }
1884 
1885 
1886 static void CheckAlnSeqLens (SeqAlignPtr salp)
1887 {
1888   Int4     aln_len, start, stop;
1889   Int4     num_rows, row;
1890   SeqIdPtr sip;
1891   BioseqPtr bsp;
1892   Boolean   is_shorter = FALSE;
1893 
1894   if (salp == NULL) return;
1895 
1896   aln_len =  AlnMgr2GetAlnLength(salp, FALSE);
1897   num_rows = AlnMgr2GetNumRows(salp);
1898   if (num_rows < 0) {
1899     return;
1900   }
1901 
1902   for (row = 1; row <= num_rows && !is_shorter; row++) {
1903     sip = AlnMgr2GetNthSeqIdPtr(salp, row);
1904     bsp = BioseqFind (sip);
1905     if (bsp != NULL && bsp->idx.entityID == salp->idx.entityID) {
1906       AlnMgr2GetNthSeqRangeInSA(salp, row, &start, &stop);
1907       if ((stop > start && stop < bsp->length - 1) || (start > stop && start > bsp->length - 1)) {
1908         is_shorter = TRUE;
1909       }
1910     }
1911     sip = SeqIdFree (sip);
1912   }
1913   if (is_shorter) {
1914     ValMessage (salp, Err_Short_Aln, SEV_INFO, NULL, NULL, 0);
1915   }
1916 }
1917 
1918  
1919 /******************************************************************
1920 validate seqid, segment length, strand in Seqalignment for Denseseg, 
1921 Densediag and Stdseg.  Also check if it's FASTA-like
1922 ******************************************************************/ 
1923 static Boolean ValidateSeqAlignFunc (SeqAlignPtr salp, Boolean find_remote_bsp)
1924 {
1925   Boolean   error=FALSE;
1926   Uint2     pcnt_identity;
1927   SeqAlignPtr salp_test;
1928   
1929   if(salp==NULL)
1930     return FALSE;
1931 
1932   /*validate if dimesion equals number of seqid*/     
1933   ValidateDimSeqIds (salp);
1934         
1935   if (find_remote_bsp) {
1936     ValidateSeqIdInSeqAlign (salp);
1937     ValidateSeqlengthinSeqAlign (salp);
1938   }
1939   /*validate strand*/
1940   ValidateStrandinSeqAlign (salp);
1941        
1942   /*validate Fasta like*/
1943   if (Is_Fasta_Seqalign (salp))
1944   {
1945       error = TRUE;
1946   }
1947       
1948   /*validate segment gap*/
1949   Segment_Gap_In_SeqAlign (salp);
1950   
1951   if (!IsAlignmentTPA(salp)) {
1952     if (salp->segtype == SAS_DENDIAG) {
1953       /* duplicate alignment, to prevent indexing from changing the original type */
1954       salp_test = SeqAlignDup (salp);
1955       pcnt_identity = AlignmentPercentIdentityEx (salp_test, FALSE, TRUE);
1956       salp_test = SeqAlignFree (salp_test);
1957     } else {
1958       pcnt_identity = AlignmentPercentIdentityEx (salp, FALSE, TRUE);
1959     }
1960 
1961     if (pcnt_identity < 50) {
1962       ValMessage (salp, Err_Pcnt_ID, SEV_WARNING, NULL, NULL, pcnt_identity);
1963     }
1964 
1965 /*    CheckAlnSeqLens (salp); */
1966   }
1967   
1968   return error;
1969 }
1970 
1971 
1972 /******************************************************************
1973 validate each alignment sequentially.  
1974 This function will subject the seqalign to all validation functions
1975 ******************************************************************/ 
1976 NLM_EXTERN Boolean ValidateSeqAlign (SeqAlignPtr salp, Uint2 entityID, Boolean message,
1977                          Boolean msg_success, Boolean find_remote_bsp,
1978                          Boolean delete_bsp, Boolean delete_salp, BoolPtr dirty)
1979 {  
1980   SeqAlignPtr  pre,
1981                salptmp;
1982   SaVal        sv;
1983   SaValPtr     svp;
1984   ValNodePtr   vnp;
1985   JYErrorMsgPtr bemp;
1986   MsgAnswer    ans;
1987   Int2         err_count=0,
1988                salp_count=0;
1989   Boolean      retdel = FALSE; 
1990 
1991   if(salp!=NULL)
1992   {
1993      sv.message = message;
1994      sv.msg_success = msg_success;
1995      sv.find_remote_bsp = find_remote_bsp;
1996      sv.delete_salp = delete_salp;
1997      sv.delete_bsp = delete_bsp;
1998      sv.retdel = TRUE;
1999      sv.do_hist_assembly = FALSE;
2000      sv.ids = NULL;
2001      sv.entityID = entityID; 
2002      sv.dirty = FALSE;   
2003      svp = &sv;   
2004      pre=NULL;
2005      salptmp=salp; 
2006      while (salptmp)
2007      {
2008         salp_count++;
2009         if (salptmp->segtype == SAS_SPARSE) {
2010            ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2011         } else if (salptmp->segtype == SAS_SPLICED) {
2012            ValMessage (salp, Err_Segtype, SEV_WARNING, NULL, NULL, salptmp->segtype);
2013         }
2014         else if (salptmp->segtype==5)
2015         {
2016            ValidateSeqAlign ((SeqAlignPtr) (salptmp->segs), entityID, message, msg_success, find_remote_bsp, delete_bsp, delete_salp, &svp->dirty);
2017         } 
2018         else if (salptmp->segtype<1 || salptmp->segtype>4)
2019         {
2020            ValMessage (salp, Err_Segtype, SEV_ERROR, NULL, NULL, salptmp->segtype);
2021         }
2022         else {
2023               ValidateSeqAlignFunc (salptmp, svp->find_remote_bsp);
2024         }         
2025            if (errorp)
2026            {
2027               if(svp->message)
2028               {
2029                  for (vnp=errorp; vnp!=NULL; vnp=vnp->next)
2030                  {
2031                     bemp=(JYErrorMsgPtr)vnp->data.ptrvalue;
2032                     ErrPostEx ((ErrSev) bemp->level, 0, 0, bemp->msg);
2033                  }
2034               }
2035               errorp = JYErrorChainDestroy (errorp);
2036               if (svp->delete_salp)
2037               {
2038             if (pre==NULL) {
2039               salp=salptmp->next;
2040               salptmp->next = NULL;
2041               SeqAlignFree (salptmp);
2042               salptmp = salp;
2043             }
2044             else {
2045               pre->next = salptmp->next;
2046               salptmp->next = NULL;
2047               SeqAlignFree (salptmp);
2048               salptmp = pre->next;
2049             }
2050            }
2051               else {
2052                  salptmp = salptmp->next;
2053               }
2054               err_count++;
2055            svp->retdel=FALSE;
2056         }
2057            else {
2058               salptmp = salptmp->next;
2059            }
2060      }
2061      if (err_count==0 && svp->msg_success) {
2062         if (salp_count>1)
2063            ans = Message (MSG_OK, "Validation test of %d alignments succeeded", salp_count);
2064         else
2065            ans = Message (MSG_OK, "Validation test of the alignment succeeded");
2066      }
2067      if (dirty)
2068         *dirty = svp->dirty;
2069      retdel = svp->retdel;
2070   }   
2071   return retdel;
2072 } 
2073 
2074 
2075 /******************************************************************
2076 call back function for REGISTER_ALIGNVALIDATION defined in sequin4.c.  
2077 Starting point for seqalign validation if user clicked on 
2078 SeqalignValidation under menu Filer/Alignment.  
2079 Either individual alignment or alignment block 
2080 should be highlighted for this validation to work
2081 ******************************************************************/ 
2082 
2083 NLM_EXTERN Int2 LIBCALLBACK ValidateSeqAlignFromData (Pointer data)
2084 { 
2085  
2086   OMProcControlPtr  ompcp;
2087   SeqAlignPtr       salp=NULL;
2088   SeqAnnotPtr       sap=NULL;
2089   SeqEntryPtr       sep=NULL;
2090   
2091   ompcp = (OMProcControlPtr) data;
2092   if (ompcp == NULL || ompcp->proc == NULL) return OM_MSG_RET_ERROR;
2093   
2094   if (ompcp->input_data == NULL) return OM_MSG_RET_ERROR;
2095   
2096   switch(ompcp->input_itemtype)
2097     {
2098     case OBJ_BIOSEQ :
2099       sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2100       break;
2101     case OBJ_BIOSEQSET :
2102       sep = SeqMgrGetSeqEntryForData (ompcp->input_data);
2103       break;
2104       /*if clicked on alignment block*/
2105     case OBJ_SEQANNOT:
2106       sap=(SeqAnnotPtr) (ompcp->input_data);
2107       break;
2108       /*if clicked on individual alignment*/
2109     case OBJ_SEQALIGN:
2110       salp=(SeqAlignPtr) (ompcp->input_data);
2111       break;
2112     case 0 :
2113       return OM_MSG_RET_ERROR;
2114     default :
2115       return OM_MSG_RET_ERROR;
2116   }
2117   
2118   ErrSetMessageLevel(SEV_ERROR);
2119   if(sap!=NULL)
2120   {
2121      salp=is_salp_in_sap(sap, 2);
2122      ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2123   }
2124   if (salp!=NULL) {
2125      ValidateSeqAlign (salp, 0, TRUE, TRUE, TRUE, FALSE, FALSE, NULL);
2126   }
2127   if (sep!=NULL) {
2128      ValidateSeqAlignInSeqEntry (sep, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE);
2129   }
2130   return OM_MSG_RET_DONE;
2131 }
2132 
2133 static void ValidateSeqAlignInAnnot (SeqAnnotPtr sap, SaValPtr svp)
2134 
2135 {
2136   SeqAlignPtr  salp;
2137 
2138   while (sap != NULL) {
2139     if (sap->type == 2) {
2140       salp = (SeqAlignPtr) sap->data;
2141       if (salp != NULL) {
2142         ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2143       }
2144     }
2145     sap = sap->next;
2146   }
2147 }
2148 
2149 static void ValidateSeqAlignInHist (SeqHistPtr hist, SaValPtr svp)
2150 
2151 {
2152   SeqAlignPtr  salp;
2153 
2154   if (hist == NULL) return;
2155   salp = hist->assembly;
2156   /* ValidateSeqAlign will validate the entire chain */
2157   ValidateSeqAlign (salp, svp->entityID, svp->message, svp->msg_success, svp->find_remote_bsp, svp->delete_bsp, svp->delete_salp, &svp->dirty);
2158 }
2159 
2160 static void ValidateSeqAlignCallback (SeqEntryPtr sep, Pointer mydata,
2161                                           Int4 index, Int2 indent)
2162 {
2163   BioseqPtr          bsp;
2164   BioseqSetPtr       bssp;
2165   SaValPtr           svp;
2166 
2167   if (sep != NULL && sep->data.ptrvalue && mydata != NULL) {
2168      svp = (SaValPtr)mydata;
2169      if (IS_Bioseq(sep)) {
2170         bsp = (BioseqPtr) sep->data.ptrvalue;
2171         if (bsp!=NULL) {
2172            ValidateSeqAlignInAnnot (bsp->annot, svp);
2173            if (svp != NULL && svp->do_hist_assembly) {
2174               ValidateSeqAlignInHist (bsp->hist, svp);
2175            }
2176         }
2177      }   
2178      else if(IS_Bioseq_set(sep)) {
2179         bssp = (BioseqSetPtr)sep->data.ptrvalue;
2180         if (bssp!=NULL) {
2181            ValidateSeqAlignInAnnot (bssp->annot, svp);
2182         }
2183      }
2184   }
2185 }
2186 
2187 
2188 
2189 NLM_EXTERN Boolean ValidateSeqAlignInSeqEntry (SeqEntryPtr sep, Boolean message, 
2190                                  Boolean msg_success, Boolean find_remote_bsp, 
2191                                  Boolean delete_bsp, Boolean delete_salp,
2192                                  Boolean do_hist_assembly)
2193 {
2194   SeqEntryPtr      sep_head;
2195   Uint2            entityID;
2196   SaVal            sv;
2197   Boolean          success=TRUE;
2198 
2199   entityID = ObjMgrGetEntityIDForChoice (sep);
2200   if (entityID > 0) {
2201      sep_head = GetTopSeqEntryForEntityID (entityID);
2202      if (sep_head != NULL) {
2203         sv.message = message;
2204         sv.msg_success = msg_success;
2205         sv.find_remote_bsp = find_remote_bsp;
2206         sv.find_acc_bsp = FALSE;
2207         sv.delete_salp = delete_salp;
2208         sv.delete_bsp = delete_bsp;
2209         sv.retdel = TRUE;
2210         sv.do_hist_assembly = do_hist_assembly;
2211         sv.ids = NULL;
2212         sv.entityID = entityID; 
2213         sv.dirty = FALSE;
2214         SeqEntryExplore (sep_head, (Pointer)&sv, ValidateSeqAlignCallback);
2215         if (sv.dirty) {
2216            ObjMgrSetDirtyFlag (entityID, TRUE);
2217            ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
2218         }
2219         success = sv.retdel;
2220      }
2221   }
2222   return success;
2223 }
2224 
2225 
2226 /* alignment validator private for regular validator */
2227 
2228 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly);
2229 
2230 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly)
2231 
2232 {
2233   GatherContext  gc;
2234   Boolean        rsult;
2235 
2236   if (vsp == NULL || sep == NULL) return FALSE;
2237   useLockByID = vsp->farIDsInAlignments;
2238   useValErr = TRUE;
2239   useVsp = vsp;
2240   vsp->gcp = &gc;
2241   vsp->bssp = NULL;
2242   vsp->bsp = NULL;
2243   vsp->sfp = NULL;
2244   vsp->descr = NULL;
2245   MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
2246   rsult = ValidateSeqAlignInSeqEntry (sep, FALSE, FALSE, find_remote_bsp, FALSE, FALSE, do_hist_assembly);
2247   useLockByID = TRUE;
2248   useValErr = FALSE;
2249   useVsp = NULL;
2250   return rsult;
2251 }
2252 
2253 
2254 /* PopulateSample and ReadFromAlignmentSample are utility functions for AlignmentPercentIdentity */
2255 static void PopulateSample (Uint1Ptr seqbuf_list, Int4Ptr start_list, 
2256                             Int4 sample_len, BioseqPtr PNTR bsp_list,
2257                             Int4 row)
2258 {
2259   Char ch;
2260   
2261   if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2262       || bsp_list[row] == NULL || start_list[row] < 0 || start_list[row] >= bsp_list[row]->length) {
2263       return;
2264   }
2265 
2266   ch = *(seqbuf_list + (row + 1) * sample_len);
2267 
2268   SeqPortStreamInt (bsp_list[row],
2269                     start_list[row], 
2270                     MIN (start_list[row] + sample_len - 1, bsp_list[row]->length - 1), 
2271                     Seq_strand_plus,
2272                     0,
2273                     seqbuf_list + row * sample_len,
2274                     NULL);
2275 
2276   /* put back char overwritten by SeqPortStreamInt */
2277   *(seqbuf_list + (row + 1) * sample_len) = ch;
2278 
2279 }
2280 
2281 
2282 static Uint1 ComplementChar (Uint1 ch)
2283 {
2284   if (ch == 'A') {
2285     return 'T';
2286   } else if (ch == 'T') {
2287     return 'A';
2288   } else if (ch == 'G') {
2289     return 'C';
2290   } else if (ch == 'C') {
2291     return 'G';
2292   } else {
2293     return ch;
2294   }
2295 }
2296 
2297 static Uint1 ReadFromAlignmentSample(Uint1Ptr seqbuf_list, Int4Ptr start_list, 
2298                                      Int4 sample_len, BioseqPtr PNTR bsp_list,
2299                                      Uint1Ptr strand_list,
2300                                      Int4 row, Int4 seq_pos)
2301 {
2302   Uint1 ch = 0;
2303   
2304   if (seqbuf_list == NULL || start_list == NULL || sample_len < 1 || row < 0 || bsp_list == NULL
2305       || bsp_list[row] == NULL || seq_pos < 0 || seq_pos >= bsp_list[row]->length) {
2306       return 0;
2307   }
2308   
2309   if (seq_pos < start_list[row] || seq_pos >= start_list[row] + sample_len) {
2310     start_list[row] = (seq_pos / sample_len) * sample_len;
2311     PopulateSample (seqbuf_list, start_list, 
2312                     sample_len, bsp_list,
2313                     row);
2314   }
2315   ch = seqbuf_list[(row * sample_len) + seq_pos - start_list[row]];
2316   if (strand_list[row] == Seq_strand_minus) {
2317     ch = ComplementChar(ch);
2318   }
2319   return ch;
2320 }
2321 
2322 typedef struct ambchar {
2323   Char ambig_char;
2324   CharPtr match_list;
2325 } AmbCharData, PNTR AmbCharPtr;
2326 
2327 static const AmbCharData ambiguity_list[] = {
2328  { 'R', "AG" },
2329  { 'Y', "CT" },
2330  { 'M', "AC" },
2331  { 'K', "GT" },
2332  { 'S', "CG" },
2333  { 'W', "AT" },
2334  { 'H', "ACT" },
2335  { 'B', "CGT" },
2336  { 'V', "ACG" },
2337  { 'D', "AGT" }};
2338 
2339 static const Int4 num_ambiguities = sizeof (ambiguity_list) / sizeof (AmbCharData);
2340 
2341 static Char AmbiguousMatch (Char ch1, Char ch2)
2342 {
2343   Int4 i;
2344   for (i = 0; i < num_ambiguities; i++) {
2345     if (ch1 == ambiguity_list[i].ambig_char
2346         && StringChr (ambiguity_list[i].match_list, ch2)) {
2347       return ch2;
2348     } else if (ch2 == ambiguity_list[i].ambig_char
2349         && StringChr (ambiguity_list[i].match_list, ch1)) {
2350       return ch1;
2351     }
2352   }
2353   return 0;
2354 }
2355 
2356 
2357 extern double *
2358 GetAlignmentColumnPercentIdentities 
2359 (SeqAlignPtr salp,
2360  Int4    start,
2361  Int4    stop,
2362  Boolean internal_gaps,
2363  Boolean internal_validation)
2364 {
2365   Int4       aln_len, num_rows, row, col_count = 0;
2366   Int4       num_match;
2367   Int4       aln_pos, seq_pos, k;
2368   Uint1          row_ch;
2369   SeqEntryPtr    oldscope;
2370   SeqIdPtr PNTR  sip_list;
2371   BioseqPtr PNTR bsp_list;
2372   Uint1Ptr       strand_list;
2373   BoolPtr        start_gap, end_gap;
2374   Int4Ptr        start_list;
2375   Uint1Ptr       seqbuf_list;
2376   Int4           sample_len = 50;
2377   Int4           chars_appearing[5]; /* 0 is A, 1 is T, 2 is G, 3 is C, 4 is internal gap */
2378   Int4           max_app, total_app, i;
2379   double *       pct_ids;
2380   
2381   if (salp == NULL || start < 0 || stop < start) return NULL;
2382  
2383   AlnMgr2IndexSingleChildSeqAlign(salp);
2384   aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2385   num_rows = AlnMgr2GetNumRows(salp);
2386   if (num_rows < 0) {
2387     Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2388     return NULL;
2389   }
2390 
2391   pct_ids = (double *) MemNew (sizeof (double) * (stop - start + 1));
2392   MemSet (pct_ids, 0, sizeof (double) * (stop - start + 1));
2393 
2394   bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2395   sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2396   strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2397   start_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2398   end_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2399   for (row = 1; row <= num_rows; row++) {
2400     sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2401     strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2402     bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2403     if (bsp_list[row - 1] == NULL) {
2404       oldscope = SeqEntrySetScope (NULL);
2405       bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2406       SeqEntrySetScope(oldscope);
2407       if (bsp_list[row - 1] == NULL) {
2408         break;
2409       }
2410     }
2411     start_gap[row - 1] = TRUE;
2412     end_gap[row - 1] = FALSE;
2413   }
2414   
2415   if (row <= num_rows) {
2416     Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2417     while (row >= 0) {
2418       sip_list[row] = SeqIdFree(sip_list[row]);
2419       BioseqUnlock(bsp_list[row]);
2420       row--;
2421     }
2422     sip_list = MemFree (sip_list);
2423     bsp_list = MemFree (bsp_list);
2424     start_gap = MemFree (start_gap);
2425     end_gap = MemFree (end_gap);
2426     return 0;
2427   }
2428   
2429   start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2430   seqbuf_list = (Uint1Ptr) MemNew (num_rows * sample_len * sizeof(Uint1));
2431   for (row = 0; row < num_rows; row++) {
2432     start_list[row] = 0;
2433     PopulateSample (seqbuf_list, start_list, 
2434                     sample_len, bsp_list,
2435                     row);
2436   }
2437   
2438   num_match = 0;
2439   for (aln_pos = start; aln_pos < aln_len && aln_pos <= stop; aln_pos++) {
2440     /* init lists */
2441     MemSet (chars_appearing, 0, sizeof (chars_appearing));
2442     for (row = 1; row <= num_rows; row++) {
2443       if (end_gap[row - 1]) {
2444         continue;
2445       }
2446       seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2447       if (seq_pos < 0) {
2448         if (start_gap[row - 1] || end_gap[row - 1]) {
2449           /* beginning/end gap - never counts against percent identity */
2450         } else {
2451           k = aln_pos + 1;
2452           while (k < aln_len && seq_pos < 0) {
2453             seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, k, row);
2454             k++;
2455           }
2456           if (seq_pos < 0) {
2457             /* now in end_gap for this sequence */
2458             end_gap[row - 1] = TRUE;
2459           } else {
2460             /* internal gaps count against percent identity when specified */
2461             if (internal_gaps) {
2462               chars_appearing[4] ++;
2463             }
2464           }
2465         }
2466       } else {
2467         start_gap[row - 1] = FALSE;
2468         
2469         row_ch = ReadFromAlignmentSample(seqbuf_list, start_list, 
2470                                          sample_len, bsp_list, strand_list,
2471                                          row - 1, seq_pos);
2472         switch (row_ch) {
2473           case 'A':
2474             chars_appearing[0]++;
2475             break;
2476           case 'T':
2477             chars_appearing[1]++;
2478             break;
2479           case 'G':
2480             chars_appearing[2]++;
2481             break;
2482           case 'C':
2483             chars_appearing[3]++;
2484             break;
2485           default:
2486             /* we don't count ambiguity characters */
2487             break;
2488        }
2489       }
2490     }
2491     max_app = 0;
2492     total_app = 0;
2493     for (i = 0; i < 4; i++) {
2494       if (chars_appearing[i] > max_app) {
2495         max_app = chars_appearing[i];
2496       }
2497       total_app += chars_appearing[i];
2498     }
2499     /* add in internal gaps */
2500     total_app += chars_appearing[4];
2501     if (total_app > 0) {
2502       pct_ids[aln_pos - start] = (double) max_app / (double) total_app;
2503     }
2504     col_count++;
2505   }
2506   
2507   for (row = 0; row < num_rows; row++) {
2508     sip_list[row] = SeqIdFree(sip_list[row]);
2509     BioseqUnlock(bsp_list[row]);
2510   }
2511   sip_list = MemFree (sip_list);
2512   bsp_list = MemFree (bsp_list);
2513   start_gap = MemFree (start_gap);
2514   end_gap = MemFree (end_gap);
2515   start_list = MemFree (start_list);
2516   seqbuf_list = MemFree (seqbuf_list);
2517       
2518   return pct_ids;  
2519 }
2520 
2521 
2522 static Uint2 AlignmentPercentIdentityEx (SeqAlignPtr salp, Boolean internal_gaps, Boolean internal_validation)
2523 {
2524   Int4       aln_len, num_rows, row, col_count = 0;
2525   Int4       num_match;
2526   Uint2      pcnt;
2527   Boolean    row_match;
2528   Int4       aln_pos, seq_pos, tmp;
2529   Uint1          seq_ch, row_ch, amb_match;
2530   SeqEntryPtr    oldscope;
2531   SeqIdPtr PNTR  sip_list;
2532   BioseqPtr PNTR bsp_list;
2533   Uint1Ptr       strand_list;
2534   Int4Ptr        start_list;
2535   Uint1Ptr       seqbuf_list;
2536   Int4           sample_len = 50;
2537   Int4Ptr        starts, stops;
2538   
2539   if (salp == NULL) return 0;
2540  
2541   AlnMgr2IndexSingleChildSeqAlign(salp);
2542   aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2543   num_rows = AlnMgr2GetNumRows(salp);
2544   if (num_rows < 0) {
2545     if (! internal_validation) {
2546       Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2547     }
2548     return 0;
2549   }
2550   bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2551   sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2552   strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2553   starts = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2554   stops = (Int4Ptr) MemNew (num_rows * sizeof (Int4));
2555   for (row = 1; row <= num_rows; row++) {
2556     sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2557     strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2558     bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2559     if (bsp_list[row - 1] == NULL) {
2560       oldscope = SeqEntrySetScope (NULL);
2561       bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2562       SeqEntrySetScope(oldscope);
2563       if (bsp_list[row - 1] == NULL) {
2564         break;
2565       }
2566     }
2567     /* get endpoints for each row */
2568     AlnMgr2GetNthSeqRangeInSA(salp, row, starts + row - 1, stops + row - 1);
2569     starts[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, starts[row - 1], row);
2570     stops[row - 1] = AlnMgr2MapBioseqToSeqAlign (salp, stops[row - 1], row);
2571     if (starts[row - 1] > stops[row - 1]) {
2572       tmp = starts[row - 1];
2573       starts[row - 1] = stops[row - 1];
2574       stops[row - 1] = tmp;
2575     }
2576 
2577   }
2578   
2579   if (row <= num_rows) {
2580     if (! internal_validation) {
2581       Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2582     }
2583     while (row > 0) {
2584       sip_list[row - 1] = SeqIdFree(sip_list[row - 1]);
2585       BioseqUnlock(bsp_list[row - 1]);
2586       row--;
2587     }
2588     sip_list = MemFree (sip_list);
2589     bsp_list = MemFree (bsp_list);
2590     starts = MemFree (starts);
2591     stops = MemFree (stops);
2592     return 0;
2593   }
2594   
2595   start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2596   seqbuf_list = (Uint1Ptr) MemNew ((num_rows * sample_len + 1) * sizeof(Uint1));
2597   for (row = 0; row < num_rows; row++) {
2598     start_list[row] = 0;
2599     PopulateSample (seqbuf_list, start_list, 
2600                     sample_len, bsp_list,
2601                     row);
2602   }
2603   
2604   num_match = 0;
2605   for (aln_pos = 0; aln_pos < aln_len; aln_pos++) {
2606     row_match = TRUE;
2607     seq_ch = 0;
2608     for (row = 1; row <= num_rows; row++) {
2609       if (aln_pos < starts[row - 1] || aln_pos > stops[row - 1]) {
2610         continue;
2611       }
2612       seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2613       if (seq_pos < 0) {
2614         if (internal_gaps) {
2615           row_match = FALSE;
2616         }
2617       } else {        
2618         row_ch = ReadFromAlignmentSample(seqbuf_list, start_list, 
2619                                          sample_len, bsp_list, strand_list,
2620                                          row - 1, seq_pos);
2621         if (row_ch == 'N') {
2622           /* do nothing - Ns do not count against percent identity */
2623         } else if (seq_ch == 0) {
2624           seq_ch = row_ch;
2625         } else if (seq_ch != row_ch) {
2626           amb_match = AmbiguousMatch (seq_ch, row_ch);
2627           if (amb_match == 0) {
2628             row_match = FALSE;
2629           } else {
2630             seq_ch = amb_match;
2631           }
2632         }
2633       }
2634     }
2635     if (row_match) {
2636       num_match++;
2637     }
2638     col_count++;
2639   }
2640   
2641   for (row = 0; row < num_rows; row++) {
2642     sip_list[row] = SeqIdFree(sip_list[row]);
2643     BioseqUnlock(bsp_list[row]);
2644   }
2645   sip_list = MemFree (sip_list);
2646   bsp_list = MemFree (bsp_list);
2647   starts = MemFree (starts);
2648   stops = MemFree (stops);
2649   start_list = MemFree (start_list);
2650   seqbuf_list = MemFree (seqbuf_list);
2651       
2652   if (col_count == 0) {
2653       pcnt = 0;
2654   } else {
2655       pcnt = (100 * num_match) / col_count;
2656   }
2657   return pcnt;
2658 }
2659 
2660 extern Uint2 AlignmentPercentIdentity (SeqAlignPtr salp, Boolean internal_gaps)
2661 {
2662   return AlignmentPercentIdentityEx (salp, internal_gaps, FALSE);
2663 }
2664 
2665 extern Uint2 WeightedAlignmentPercentIdentity (SeqAlignPtr salp, Boolean internal_gaps)
2666 {
2667   Int4       aln_len, num_rows, row, col_count = 0;
2668   Int4       num_match;
2669   Uint2      pcnt;
2670   Int4       aln_pos, seq_pos, k;
2671   Uint1          row_ch;
2672   SeqEntryPtr    oldscope;
2673   SeqIdPtr PNTR  sip_list;
2674   BioseqPtr PNTR bsp_list;
2675   Uint1Ptr       strand_list;
2676   BoolPtr        start_gap, end_gap;
2677   Int4Ptr        start_list;
2678   Uint1Ptr       seqbuf_list;
2679   Int4           sample_len = 50;
2680   Int4           chars_appearing[5]; /* 0 is A, 1 is T, 2 is G, 3 is C, 4 is internal gap */
2681   double         col_pct, col_pct_total = 0;
2682   Int4           max_app, total_app, i;
2683   
2684   if (salp == NULL) return 0;
2685  
2686   AlnMgr2IndexSingleChildSeqAlign(salp);
2687   aln_len = AlnMgr2GetAlnLength(salp, FALSE);
2688   num_rows = AlnMgr2GetNumRows(salp);
2689   if (num_rows < 0) {
2690     Message (MSG_POSTERR, "AlnMgr2GetNumRows failed");
2691     return 0;
2692   }
2693   bsp_list = (BioseqPtr PNTR) MemNew (num_rows * sizeof (BioseqPtr));
2694   sip_list = (SeqIdPtr PNTR) MemNew (num_rows * sizeof(SeqIdPtr));
2695   strand_list = (Uint1Ptr) MemNew (num_rows * sizeof(Uint1));
2696   start_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2697   end_gap = (BoolPtr) MemNew (num_rows * sizeof(Boolean));
2698   for (row = 1; row <= num_rows; row++) {
2699     sip_list[row - 1] = AlnMgr2GetNthSeqIdPtr(salp, row);
2700     strand_list[row - 1] = AlnMgr2GetNthStrand(salp, row);
2701     bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2702     if (bsp_list[row - 1] == NULL) {
2703       oldscope = SeqEntrySetScope (NULL);
2704       bsp_list[row - 1] = BioseqLockById(sip_list[row - 1]);
2705       SeqEntrySetScope(oldscope);
2706       if (bsp_list[row - 1] == NULL) {
2707         break;
2708       }
2709     }
2710     start_gap[row - 1] = TRUE;
2711     end_gap[row - 1] = FALSE;
2712   }
2713   
2714   if (row <= num_rows) {
2715     Message (MSG_POSTERR, "Unable to locate Bioseq in alignment");
2716     while (row >= 0) {
2717       sip_list[row] = SeqIdFree(sip_list[row]);
2718       BioseqUnlock(bsp_list[row]);
2719       row--;
2720     }
2721     sip_list = MemFree (sip_list);
2722     bsp_list = MemFree (bsp_list);
2723     start_gap = MemFree (start_gap);
2724     end_gap = MemFree (end_gap);
2725     return 0;
2726   }
2727   
2728   start_list = (Int4Ptr) MemNew (num_rows * sizeof(Int4));
2729   seqbuf_list = (Uint1Ptr) MemNew (num_rows * sample_len * sizeof(Uint1));
2730   for (row = 0; row < num_rows; row++) {
2731     start_list[row] = 0;
2732     PopulateSample (seqbuf_list, start_list, 
2733                     sample_len, bsp_list,
2734                     row);
2735   }
2736   
2737   num_match = 0;
2738   for (aln_pos = 0; aln_pos < aln_len; aln_pos++) {
2739     /* init lists */
2740     MemSet (chars_appearing, 0, sizeof (chars_appearing));
2741     for (row = 1; row <= num_rows; row++) {
2742       if (end_gap[row - 1]) {
2743         continue;
2744       }
2745       seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
2746       if (seq_pos < 0) {
2747         if (start_gap[row - 1] || end_gap[row - 1]) {
2748           /* beginning/end gap - never counts against percent identity */
2749         } else {
2750           k = aln_pos + 1;
2751           while (k < aln_len && seq_pos < 0) {
2752             seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, k, row);
2753             k++;
2754           }
2755           if (seq_pos < 0) {
2756             /* now in end_gap for this sequence */
2757             end_gap[row - 1] = TRUE;
2758           } else {
2759             /* internal gaps count against percent identity when specified */
2760             if (internal_gaps) {
2761               chars_appearing[4] ++;
2762             }
2763           }
2764         }
2765       } else {
2766         start_gap[row - 1] = FALSE;
2767         
2768         row_ch = ReadFromAlignmentSample(seqbuf_list, start_list, 
2769                                          sample_len, bsp_list, strand_list,
2770                                          row - 1, seq_pos);
2771         switch (row_ch) {
2772           case 'A':
2773             chars_appearing[0]++;
2774             break;
2775           case 'T':
2776             chars_appearing[1]++;
2777             break;
2778           case 'G':
2779             chars_appearing[2]++;
2780             break;
2781           case 'C':
2782             chars_appearing[3]++;
2783             break;
2784           default:
2785             /* we don't count ambiguity characters */
2786             break;
2787        }
2788       }
2789     }
2790     max_app = 0;
2791     total_app = 0;
2792     for (i = 0; i < 4; i++) {
2793       if (chars_appearing[i] > max_app) {
2794         max_app = chars_appearing[i];
2795       }
2796       total_app += chars_appearing[i];
2797     }
2798     if (total_app > 0) {
2799       col_pct = (double) max_app / (double) total_app;
2800       col_pct_total += col_pct;
2801     }
2802     col_count++;
2803   }
2804   
2805   for (row = 0; row < num_rows; row++) {
2806     sip_list[row] = SeqIdFree(sip_list[row]);
2807     BioseqUnlock(bsp_list[row]);
2808   }
2809   sip_list = MemFree (sip_list);
2810   bsp_list = MemFree (bsp_list);
2811   start_gap = MemFree (start_gap);
2812   end_gap = MemFree (end_gap);
2813   start_list = MemFree (start_list);
2814   seqbuf_list = MemFree (seqbuf_list);
2815       
2816   if (col_count == 0) {
2817       pcnt = 0;
2818   } else {
2819       pcnt = (100 * col_pct_total) / col_count;
2820   }
2821   return pcnt;
2822 }
2823 
2824 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.