NCBI C Toolkit Cross Reference

C/api/aceread.c


  1 /*
  2  * $Id: aceread.c,v 1.16 2008/12/22 22:40:30 bollin Exp $
  3  *
  4  * ===========================================================================
  5  *
  6  *                            PUBLIC DOMAIN NOTICE
  7  *               National Center for Biotechnology Information
  8  *
  9  *  This software/database is a "United States Government Work" under the
 10  *  terms of the United States Copyright Act.  It was written as part of
 11  *  the author's official duties as a United States Government employee and
 12  *  thus cannot be copyrighted.  This software/database is freely available
 13  *  to the public for use. The National Library of Medicine and the U.S.
 14  *  Government have not placed any restriction on its use or reproduction.
 15  *
 16  *  Although all reasonable efforts have been taken to ensure the accuracy
 17  *  and reliability of the software and data, the NLM and the U.S.
 18  *  Government do not and cannot warrant the performance or results that
 19  *  may be obtained by using this software or data. The NLM and the U.S.
 20  *  Government disclaim all warranties, express or implied, including
 21  *  warranties of performance, merchantability or fitness for any particular
 22  *  purpose.
 23  *
 24  *  Please cite the author in any work or product based on this material.
 25  *
 26  * ===========================================================================
 27  *
 28  * Authors:  Colleen Bollin
 29  *
 30  */
 31 
 32 
 33 #include <stdlib.h>
 34 #include <stdio.h>
 35 #include <string.h>
 36 #include <ctype.h>
 37 #include <util/creaders/alnread.h>
 38 #include <aceread.h>
 39 
 40 
 41 typedef enum {
 42     eTrue = -1,
 43     eFalse = 0
 44 } EBool;
 45 
 46 
 47 typedef enum {
 48     eJustRight = 0,
 49     eNone,
 50     eTooMany,
 51     eTooFew, 
 52     eUnexpected
 53 } EFound;
 54 
 55 
 56 extern void PrintACEFormatErrorXMLStart (char *id, char *has_errors)
 57 {
 58     if (has_errors != NULL) {
 59         if (*has_errors == 0) {
 60             printf ("<aceread>\n");
 61             *has_errors = 1;
 62         }
 63     }
 64     printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">", id == NULL ? "No ID" : id);
 65 }
 66 
 67 
 68 extern void PrintACEFormatErrorXMLEnd (void)
 69 {
 70     printf ("</message>\n");
 71 }
 72 
 73 
 74 extern void PrintACEFormatErrorXML (char *msg, char *id, char *has_errors)
 75 {
 76     if (has_errors != NULL) {
 77         if (*has_errors == 0) {
 78             printf ("<aceread>\n");
 79             *has_errors = 1;
 80         }
 81     }
 82     printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">%s</message>\n", id == NULL ? "No ID" : id, msg);
 83 }
 84 
 85 
 86 static void s_ReportFound (EFound val, char *label, char *id, char *has_errors)
 87 {
 88     switch (val) {
 89         case eNone:
 90             PrintACEFormatErrorXMLStart (id, has_errors);
 91             printf ("Found no %s", label);
 92             PrintACEFormatErrorXMLEnd ();
 93             break;
 94         case eTooMany:
 95             PrintACEFormatErrorXMLStart (id, has_errors);
 96             printf ("Too many %s", label);
 97             PrintACEFormatErrorXMLEnd ();
 98             break;
 99         case eTooFew:
100             PrintACEFormatErrorXMLStart (id, has_errors);
101             printf ("Too few %s", label);
102             PrintACEFormatErrorXMLEnd ();
103             break;
104         case eUnexpected:
105             PrintACEFormatErrorXMLStart (id, has_errors);
106             printf ("Unexpected character while reading %s", label);
107             PrintACEFormatErrorXMLEnd ();
108             break;
109         case eJustRight:
110             break;
111         default:
112             PrintACEFormatErrorXML ("Unknown error", id, has_errors);
113             break;
114     }
115 }
116 
117 
118 extern TGapInfoPtr GapInfoNew (void)
119 {
120     TGapInfoPtr g;
121 
122     g = (TGapInfoPtr) malloc (sizeof (SGapInfo));
123     if (g != NULL) {
124         g->num_gaps = 0;
125         g->gap_offsets = NULL;
126     }
127     return g;
128 }
129 
130 
131 extern void GapInfoFree (TGapInfoPtr g)
132 {
133     if (g != NULL) {
134         free (g->gap_offsets);
135         free (g);
136     }
137 }
138 
139 
140 static int s_IsGapChar (char ch, char *gap_chars)
141 {
142     if (ch == 0 || gap_chars == NULL) {
143         return 0;
144     }
145     while (*gap_chars != 0 && *gap_chars != ch) {
146         gap_chars ++;
147     }
148     if (*gap_chars == ch) {
149         return 1;
150     } else {
151         return 0;
152     }
153 }
154 
155 
156 /* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap */
157 extern TGapInfoPtr GapInfoFromSequenceString (char *seq_str, char *gap_chars)
158 { 
159     char * cp;
160     int    num_gaps = 0, pos, gap_num = 0;
161     TGapInfoPtr g = NULL;
162 
163     if (seq_str == NULL) return NULL;
164 
165     /* first determine number of gaps */
166     cp = seq_str;
167     while (*cp != 0) {
168         if (s_IsGapChar(*cp, gap_chars)) {
169             num_gaps++;
170         }
171         cp++;
172     }
173 
174     g = GapInfoNew ();
175     if (num_gaps > 0) {
176         g->num_gaps = num_gaps;
177         g->gap_offsets = malloc (g->num_gaps * sizeof (int));
178         cp = seq_str;
179         pos = 0;
180         while (*cp != 0) {
181             if (s_IsGapChar(*cp, gap_chars)) {
182                 g->gap_offsets[gap_num] = pos;
183                 gap_num++;
184                 pos = 0;
185             } else {
186                 pos++;
187             }
188             cp++;
189         }
190     }
191     return g;
192 }
193 
/* Removes, in place, every character of seq_str that appears in gap_chars.
 * The remaining characters keep their order and the shortened string is
 * NUL-terminated.  Nothing happens when either argument is NULL.
 */
extern void RemoveGapCharsFromSequenceString (char *seq_str, char *gap_chars)
{
    char *cp_src, *cp_dst;

    if (seq_str == NULL || gap_chars == NULL) {
      return;
    }

    cp_dst = seq_str;
    for (cp_src = seq_str; *cp_src != 0; cp_src++) {
        /* *cp_src is never NUL here, so strchr cannot match the terminator */
        if (strchr (gap_chars, *cp_src) == NULL) {
            *cp_dst = *cp_src;
            cp_dst++;
        }
    }
    /* terminate the compacted string; otherwise the old tail stays visible */
    *cp_dst = 0;
}
212 
213 
214 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
215 extern int SeqPosFromTilingPos (int tiling_pos, TGapInfoPtr gap_info)
216 {
217     int pos = 0, seq_pos = 0, gap_num = 0;
218 
219     if (tiling_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
220         return tiling_pos;
221     }
222     
223     while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= tiling_pos) {
224         seq_pos += gap_info->gap_offsets[gap_num];
225         pos += gap_info->gap_offsets[gap_num] + 1;
226         gap_num++;
227     }
228     seq_pos += tiling_pos - pos;
229     return seq_pos;
230 }
231 
232 
233 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
234 extern int TilingPosFromSeqPos (int seq_pos, TGapInfoPtr gap_info)
235 {
236     int pos = 0, tiling_pos = 0, gap_num = 0;
237 
238     if (seq_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
239         return seq_pos;
240     }
241 
242     while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= seq_pos) {
243         pos += gap_info->gap_offsets[gap_num];
244         tiling_pos += gap_info->gap_offsets[gap_num] + 1;
245         gap_num++;
246     }
247     tiling_pos += seq_pos - pos;
248     return tiling_pos;
249 }
250 
251 
252 /* adjust gap info when sequence is trimmed */
253 static void AdjustGapInfoFor5Trim (TGapInfoPtr gap_info, int trim)
254 {
255     int pos = 0;
256     int num_gaps = 0;
257     int i;
258 
259     if (gap_info == NULL || gap_info->num_gaps < 1 || trim < 1) {
260         return;
261     }
262 
263     while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < trim) {
264         pos += gap_info->gap_offsets[num_gaps];
265         num_gaps++;
266     }
267     if (num_gaps < gap_info->num_gaps) {
268         gap_info->gap_offsets[num_gaps] -= trim - pos;
269         for (i = num_gaps; i < gap_info->num_gaps; i++) {
270             gap_info->gap_offsets[i - num_gaps] = gap_info->gap_offsets[i];
271         }
272         gap_info->num_gaps -= num_gaps;
273     } else {
274         free (gap_info->gap_offsets);
275         gap_info->gap_offsets = NULL;
276         gap_info->num_gaps = 0;
277     }
278 
279 }
280 
281 
282 static void AdjustGapInfoFor3Trim (TGapInfoPtr gap_info, int new_len)
283 {
284     int pos = 0;
285     int num_gaps = 0;
286 
287     if (gap_info == NULL || gap_info->num_gaps < 1) {
288         return;
289     }
290 
291     while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < new_len) {
292         pos += gap_info->gap_offsets[num_gaps];
293         num_gaps++;
294     }
295     if (num_gaps < gap_info->num_gaps) {
296         gap_info->num_gaps = num_gaps;
297     }
298 }
299 
300 /* TODO: NEED TO write function for truncating on right, test function for truncating on left */
301 
302 extern TContigReadPtr ContigReadNew (void)
303 {
304     TContigReadPtr r;
305 
306     r = (TContigReadPtr) malloc (sizeof (SContigRead));
307     if (r == NULL) {
308         return NULL;
309     }
310     r->read_id = NULL;
311     r->ti = 0;
312     r->srr = NULL;
313     r->read_seq = NULL;
314     r->is_complement = 0;
315     r->cons_start = 0;
316     r->cons_stop = 0;
317     r->gaps = NULL;
318     r->local = 1;
319     r->valid = 0;
320     r->qual_scores = NULL;
321     r->num_qual_scores = 0;
322     r->tag = NULL;
323     return r;
324 }
325 
326 
327 extern void ContigReadFree (TContigReadPtr r)
328 {
329     if (r != NULL) {
330         if (r->read_id != NULL) {
331             free (r->read_id);
332         }
333         if (r->srr != NULL) {
334             free (r->srr);
335         }
336         if (r->read_seq != NULL) {
337             free (r->read_seq);
338         }
339         if (r->gaps != NULL) {
340             GapInfoFree (r->gaps);
341         }
342         if (r->qual_scores != NULL) {
343             free (r->qual_scores);
344         }
345         if (r->tag != NULL) {
346             free (r->tag);
347         }
348         free (r);
349     }
350 }
351 
352 
353 extern TBaseSegPtr BaseSegNew (void)
354 {
355     TBaseSegPtr b;
356 
357     b = (TBaseSegPtr) malloc (sizeof (SBaseSeg));
358     if (b == NULL) {
359         return NULL;
360     }
361     b->read_id = NULL;
362     b->cons_start = 0;
363     b->cons_stop = 0;
364     return b;
365 }
366 
367 
368 extern void BaseSegFree (TBaseSegPtr b)
369 {
370     if (b != NULL) {
371         if (b->read_id != NULL) {
372             free (b->read_id);
373         }
374         free (b);
375     }
376 }
377 
378 
379 /* reads a correctly formatted line and creates a base seg.
380  */
381 static TBaseSegPtr s_ReadBaseSeg (char *line)
382 {
383     TBaseSegPtr base_seg = NULL;
384     char *cp;
385     int   start, stop, len;
386 
387     if (line == NULL || *line != 'B' || *(line + 1) != 'S') {
388         return NULL;
389     }
390 
391 
392     cp = line + 2;
393     while (isspace (*cp)) {
394         cp++;
395     }
396     if (!isdigit (*cp)) {
397         return NULL;
398     }
399     start = atoi (cp);
400     while (isdigit (*cp)) {
401         cp++;
402     }
403     while (isspace (*cp)) {
404         cp++;
405     }
406     if (!isdigit (*cp)) {
407         return NULL;
408     }
409     stop = atoi (cp);
410     while (isdigit (*cp)) {
411         cp++;
412     }
413     while (isspace (*cp)) {
414         cp++;
415     }
416     if (*cp == 0) {
417         return NULL;
418     }
419 
420     len = strlen (cp);
421 
422     base_seg = BaseSegNew ();
423     base_seg->cons_start = start;
424     base_seg->cons_stop = stop;
425     base_seg->read_id = malloc (sizeof (char) * len + 1);
426     strcpy (base_seg->read_id, cp);
427 
428     return base_seg;
429 }
430 
431 
432 extern TConsensusReadAlnPtr ConsensusReadAlnNew (int numseg)
433 {
434     TConsensusReadAlnPtr a;
435     int i;
436 
437     a = (TConsensusReadAlnPtr) malloc (sizeof (SConsensusReadAln));
438     a->is_complement = 0;
439     if (numseg < 1) {
440         a->lens = NULL;
441         a->cons_starts = NULL;
442         a->read_starts = NULL;
443         a->numseg = 0;
444     } else {
445         a->lens = (int *) malloc (sizeof (int) * numseg);
446         a->cons_starts = (int *) malloc (sizeof (int) * numseg);
447         a->read_starts = (int *) malloc (sizeof (int) * numseg);
448         for (i = 0; i < numseg; i++) {
449             a->lens[i] = 0;
450             a->cons_starts[i] = 0;
451             a->read_starts[0] = 0;
452         }
453         a->numseg = numseg;
454     }
455     return a;
456 }
457 
458 
459 extern TConsensusReadAlnPtr ConsensusReadAlnFree (TConsensusReadAlnPtr a)
460 {
461     if (a != NULL) {
462         if (a->lens != NULL) {
463             free (a->lens);
464             a->lens = NULL;
465         }
466         if (a->cons_starts != NULL) {
467             free (a->cons_starts);
468             a->cons_starts = NULL;
469         }
470         if (a->read_starts != NULL) {
471             free (a->read_starts);
472             a->read_starts = NULL;
473         }
474         free (a);
475         a = NULL;
476     }
477     return a;
478 }
479 
480 
481 extern TConsensusReadAlnPtr GetConsensusReadAln (char *consensus_seq, TContigReadPtr read)
482 {
483     TConsensusReadAlnPtr aln = NULL;
484     char *c;
485     char *c_start;
486     char *r;
487     char *r_start;
488     int numseg = 0, aln_len, pos, seg, con_offset = 0, read_offset = 0;
489     char con_gap_open = 0, read_gap_open = 0, gap_change;
490 
491     if (consensus_seq == NULL || read == NULL) {
492         return NULL;
493     }
494 
495     if (read->cons_start > 0) {
496         c_start = consensus_seq + read->cons_start;
497         r_start = read->read_seq + read->read_assem_start - 1;
498     } else {
499         c_start = consensus_seq;
500         r_start = read->read_seq + read->read_assem_start - 1;
501     }
502 
503     aln_len = read->cons_stop - read->cons_start + 1;
504     while (*c_start == '*' && *r_start == '*') {
505         c_start++;
506         r_start++;
507         aln_len--;
508     }
509 
510     /* first, count number of segments needed */
511     c = c_start;
512     r = r_start;
513     if (*c != '*' && *r != '*') {
514       numseg++;
515     }
516     pos = 0;
517     while (*c != 0 && *r != 0 && pos < aln_len) {
518         if (*c == '*' && *r == '*') {
519             /* both in gap - ignore */
520         } else {
521             gap_change = 0;
522             if (*c == '*') {
523                 if (!con_gap_open) {
524                     gap_change = 1;
525                     con_gap_open = 1;
526                 }
527             } else {
528                 if (con_gap_open) {
529                     gap_change = 1;
530                     con_gap_open = 0;
531                 }
532             }
533             if (*r == '*') {
534                 if (!read_gap_open) {
535                     gap_change = 1;
536                     read_gap_open = 1;
537                 }
538             } else {
539                 if (read_gap_open) {
540                     gap_change = 1;
541                     read_gap_open = 0;
542                 }
543             }
544             if (gap_change) {
545                 numseg++;
546             }
547         }
548         c++;
549         r++;
550         pos++;
551     }
552 
553     /* create alignment */
554     aln = ConsensusReadAlnNew (numseg);
555     pos = 0;
556     seg = 0;
557 
558 
559     c = consensus_seq;        
560     while (c < c_start) {
561         if (*c != '*') {
562             con_offset ++;
563         }
564         c++;
565     }
566 
567     r = read->read_seq;
568     while (r < r_start) {
569         if (*r != '*') {
570             read_offset ++;
571         }
572         r++;
573     }
574     
575     
576     if (*c_start == '*') {
577         aln->cons_starts[0] = -1;
578         con_gap_open = 1;
579     } else {
580         aln->cons_starts[0] = con_offset;
581         con_gap_open = 0;
582     }
583 
584     if (*r_start == '*') {
585         aln->read_starts[0] = -1;
586         read_gap_open = 1;
587     } else {
588         aln->read_starts[0] = read_offset;
589         read_gap_open = 0;
590     }
591 
592     c = c_start + 1;
593     r = r_start + 1;
594     aln->lens[0] = 1;
595     pos = 1;
596     
597     while (*c != 0 && *r != 0 && pos < aln_len) {
598         if (*c == '*' && *r == '*') {
599             /* both in gap - ignore */
600         } else {
601             gap_change = 0;
602             if (*c == '*') {
603                 if (!con_gap_open) {
604                     gap_change = 1;
605                     con_gap_open = 1;
606                 }
607             } else {
608                 if (con_gap_open) {
609                     gap_change = 1;
610                     con_gap_open = 0;
611                 }
612             }
613             if (*r == '*') {
614                 if (!read_gap_open) {
615                     gap_change = 1;
616                     read_gap_open = 1;
617                 }
618             } else {
619                 if (read_gap_open) {
620                     gap_change = 1;
621                     read_gap_open = 0;
622                 }
623             }
624             if (gap_change) {
625                 seg++;
626                 if (con_gap_open) {
627                     aln->cons_starts[seg] = -1;
628                 } else if (aln->cons_starts[seg - 1] > -1) {
629                     aln->cons_starts[seg] = aln->cons_starts[seg - 1] + aln->lens[seg - 1];
630                 } else if (seg > 1 && aln->cons_starts[seg - 2] > -1) {
631                     aln->cons_starts[seg] = aln->cons_starts[seg - 2] + aln->lens[seg - 2];
632                 } else {
633                     aln->cons_starts[seg] = con_offset;
634                 }
635                 if (read_gap_open) {
636                     aln->read_starts[seg] = -1;
637                 } else if (aln->read_starts[seg - 1] > -1) {
638                     aln->read_starts[seg] = aln->read_starts[seg - 1] + aln->lens[seg - 1];
639                 } else if (seg > 1 && aln->read_starts[seg - 2] > -1) {
640                     aln->read_starts[seg] = aln->read_starts[seg - 2] + aln->lens[seg - 2];
641                 } else {
642                     aln->read_starts[seg] = read_offset;
643                 }
644             }
645             aln->lens[seg]++;
646         }
647         c++;
648         r++;
649         pos++;
650     }
651 
652     /* todo - adjust starts for complement */
653     if (read->is_complement) {
654       for (seg = 0; seg < aln->numseg; seg++) {
655         if (aln->read_starts[seg] > -1) {
656           aln->read_starts[seg] = read->read_len - aln->read_starts[seg] - aln->lens[seg];
657         }
658       }
659       aln->is_complement = 1;
660     }
661 
662     return aln;
663 }
664 
665 
666 extern TContigPtr ContigNew (void)
667 {
668     TContigPtr c;
669 
670     c = (TContigPtr) malloc (sizeof (SContig));
671     if (c == NULL) {
672         return NULL;
673     }
674     c->consensus_id = NULL;
675     c->consensus_seq = NULL;
676     c->consensus_assem_len = 0;
677     c->consensus_seq_len = 0;
678     c->is_complement = 0;
679     c->num_qual_scores = 0;
680     c->qual_scores = NULL;
681     c->num_reads = 0;
682     c->reads = NULL;
683     c->gaps = NULL;
684     c->num_reads = 0;
685     c->reads = NULL;
686     c->num_base_segs = 0;
687     c->base_segs = NULL;
688     c->tag = NULL;
689 
690     return c;
691 }
692 
693 
694 extern void ContigFree (TContigPtr c)
695 {
696     int i;
697 
698     if (c != NULL) {
699         if (c->consensus_id != NULL) free (c->consensus_id);
700         if (c->consensus_seq != NULL) free (c->consensus_seq);
701         if (c->qual_scores != NULL) free (c->qual_scores);
702             
703         if (c->reads != NULL) {
704             for (i = 0; i < c->num_reads; i++) {
705                 if (c->reads[i] != NULL) {
706                     ContigReadFree (c->reads[i]);
707                 }
708             }
709             free (c->reads);
710         }
711         if (c->base_segs != NULL) {
712             for (i = 0; i < c->num_base_segs; i++) {
713                 if (c->base_segs[i] != NULL) {
714                     BaseSegFree (c->base_segs[i]);
715                 }
716             }
717             free (c->base_segs);
718         }
719         if (c->tag != NULL) {
720             free (c->tag);
721         }
722         free (c);
723     }
724 }
725 
726 
727 extern TACEFilePtr ACEFileNew ()
728 {
729     TACEFilePtr afp;
730 
731     afp = (TACEFilePtr) malloc (sizeof (SACEFile));
732     if (afp == NULL) {
733         return NULL;
734     }
735     afp->num_contigs = 0;
736     afp->contigs = NULL;
737 
738     return afp;
739 }
740 
741 
742 extern void ACEFileFree (TACEFilePtr afp)
743 {
744     int i;
745 
746     if (afp != NULL) {
747         for (i = 0; i < afp->num_contigs; i++) {
748             ContigFree (afp->contigs[i]);
749         }
750         free (afp->contigs);
751         free (afp);
752     }      
753 }
754 
755 
/* Returns 1 when ch may appear in sequence data, i.e. it is the gap
 * character '*' or alphabetic; returns 0 otherwise.
 */
static char s_IsSeqChar (char ch)
{
    /* isalpha is only defined for values representable as unsigned char
     * (or EOF), so a possibly negative char must be converted first
     */
    if (ch == '*' || isalpha ((unsigned char) ch)) {
        return 1;
    } else {
        return 0;
    }
}
764 
765 
/* Returns 1 when linestring signals the end of input - it is NULL or its
 * first character compares equal to EOF - and 0 otherwise.
 */
static char s_IsEOF (char *linestring)
{
    if (linestring == NULL) {
        return 1;
    }
    return (linestring [0] == EOF) ? 1 : 0;
}
774 
775 
776 static char *
777 s_ReadSequenceFromFile
778 (int                  len, 
779  FReadLineFunction    readfunc,
780  void *               userdata,
781  char *               id,
782  char *               has_errors)
783 {
784     char *seq;
785     char *linestring;
786     char *cp;
787     int  pos = 0;
788 
789     /* copy in sequence data */
790     seq = malloc (len + 1);
791     linestring = readfunc (userdata);
792     while (!s_IsEOF (linestring) && s_IsSeqChar (linestring [0])) {
793         /* append to consensus */
794         cp = linestring;
795         while (s_IsSeqChar (*cp) && pos < len) {
796             if (isalpha (*cp)) {
797                 seq [pos] = toupper (*cp);
798             } else {
799                 seq [pos] = *cp;
800             }
801             pos++;
802             cp++;
803         }
804         if (s_IsSeqChar (*cp)) {
805             PrintACEFormatErrorXML ("Too many sequence characters!", id, has_errors);
806             free (seq);
807             return NULL;
808         }
809         free (linestring);
810         linestring = readfunc (userdata);
811     }
812     free (linestring);
813     if (pos < len) {
814         PrintACEFormatErrorXML ("Too few sequence characters!", id, has_errors);
815         free (seq);
816         seq = NULL;
817     } else {
818         seq[pos] = 0;
819     }
820     return seq;
821 }
822 
823 
/* Returns 1 when linestring is a real line (not the end of input) that is
 * empty or contains only whitespace; returns 0 otherwise.
 */
static char s_LineIsEmptyButNotEof (char *linestring)
{
    char *cp;

    if (s_IsEOF (linestring)) {
        return 0;
    } 

    /* isspace needs a value representable as unsigned char */
    for (cp = linestring; *cp != 0; cp++) {
        if (!isspace ((unsigned char) *cp)) {
            return 0;
        }
    }
    return 1;
}
841 
842 
843 static void s_SkipQualScores
844 (FReadLineFunction    readfunc,
845  void *               userdata)
846 {
847     char * linestring;
848     char * cp;
849     if (readfunc == NULL) return;
850 
851     linestring = readfunc (userdata);
852     while (s_LineIsEmptyButNotEof (linestring)) {
853         free (linestring);
854         linestring = readfunc (userdata);
855     }
856     if (linestring == NULL  ||  linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
857         return;
858     }
859     linestring = readfunc (userdata);
860     while (!s_IsEOF (linestring)
861            && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
862         free (linestring);   
863         linestring = readfunc (userdata);
864     }
865     free (linestring);
866 }
867 
868 
869 static EFound s_ReadQualScores
870 (TContigPtr contig,
871  FReadLineFunction    readfunc,
872  void *               userdata)
873 {
874     char * linestring;
875     char * cp;
876     int    pos;
877 
878     if (contig == NULL || readfunc == NULL || contig->consensus_assem_len == 0) {
879         return eNone;
880     }
881 
882     linestring = readfunc (userdata);
883     while (s_LineIsEmptyButNotEof (linestring)) {
884         free (linestring);
885         linestring = readfunc (userdata);
886     }
887     if (linestring == NULL  ||  linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
888         return eNone;
889     }
890 
891     /* read quality scores */
892     contig->num_qual_scores = contig->consensus_assem_len;
893     /* no score for * in consensus seq */
894     for (pos = 0; pos < contig->consensus_assem_len; pos++) {
895       if (contig->consensus_seq[pos] == '*') {
896         contig->num_qual_scores --;
897       }
898     }
899     contig->qual_scores = malloc (sizeof (int) * contig->num_qual_scores);
900     pos = 0;
901     linestring = readfunc (userdata);
902     while (!s_IsEOF (linestring)
903            && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
904         while (isdigit (*cp) && pos < contig->num_qual_scores) {
905             contig->qual_scores [pos] = atoi (cp);
906             pos++;
907             while (isdigit (*cp)) {
908                 cp++;
909             }
910             while (isspace (*cp)) {
911                 cp++;
912             }
913         }
914         if (isdigit (*cp)) {
915             return eTooMany;
916         }
917         free (linestring);   
918         linestring = readfunc (userdata);
919     }
920     if (pos < contig->num_qual_scores) {
921         return eTooFew;
922     } else {
923         return eJustRight;
924     }
925 }
926 
927 
928 static EFound s_ReadAFLines
929 (TContigPtr contig,
930  FReadLineFunction    readfunc,
931  void *               userdata,
932  char **              next_line)
933 {
934     char * linestring;
935     char * cp;
936     int    read_num, len;
937     EFound rval = eJustRight;
938 
939     if (contig == NULL || readfunc == NULL || contig->num_reads == 0) return eNone;
940 
941     /* get AF lines */
942     contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
943     linestring = readfunc (userdata);
944     while (s_LineIsEmptyButNotEof (linestring)) {
945         free (linestring);
946         linestring = readfunc (userdata);
947     }
948     if (linestring == NULL  ||  linestring [0] == EOF || strncmp (linestring, "AF", 2) != 0) {
949         *next_line = linestring;
950         return eNone;
951     }
952     
953     read_num = 0;
954     while (!s_IsEOF(linestring) && read_num < contig->num_reads
955            && linestring [0] == 'A' && linestring [1] == 'F' && isspace (linestring [2])) {
956         contig->reads[read_num] = ContigReadNew ();
957         len = strlen (linestring + 3);
958         contig->reads[read_num]->read_id = malloc (len + 1);
959         strcpy (contig->reads[read_num]->read_id, linestring + 3);
960         cp = contig->reads[read_num]->read_id;
961         while (*cp != 0 && !isspace (*cp)) {
962             cp++;
963         }
964         if (isspace (*cp)) {
965             *cp = 0;
966             cp++;
967         }
968         if (*cp == 'C') {
969             contig->reads[read_num]->is_complement = 1;
970         } else if (*cp != 'U') {
971             *next_line = linestring;
972             return eUnexpected;
973         }
974         cp++;
975         if (isspace (*cp)) {
976             cp++;
977         }
978         contig->reads[read_num]->cons_start = atoi (cp) - 1;
979         read_num++;
980         free (linestring);
981         linestring = readfunc (userdata);
982     }
983     if (read_num < contig->num_reads) {
984         rval = eTooFew;
985     } else if (!s_IsEOF(linestring) && strncmp (linestring, "AF ", 3) == 0) {
986         rval = eTooMany;
987     } else {
988         rval = eJustRight;
989     }
990     *next_line = linestring;
991     return rval;
992 }
993 
994 
995 static EFound s_ReadBaseSegs
996 (TContigPtr           contig,
997  int                  num_base_segs,
998  char *               firstline,
999  FReadLineFunction    readfunc,
1000  void *               userdata)
1001 {
1002     char * linestring;
1003 
1004     if (contig == NULL || readfunc == NULL || num_base_segs == 0) return eNone;
1005 
1006     contig->base_segs = malloc (sizeof (TBaseSegPtr) * num_base_segs);
1007     contig->num_base_segs = 0;
1008 
1009     /* get BS lines */
1010     linestring = firstline;
1011     while (s_LineIsEmptyButNotEof (linestring)) {
1012         free (linestring);
1013         linestring = readfunc (userdata);
1014     }
1015     if (linestring == NULL  ||  linestring [0] == EOF || strncmp (linestring, "BS", 2) != 0) {
1016         return eNone;
1017     }
1018     
1019     while (linestring != NULL  &&  linestring [0] != EOF && contig->num_base_segs < num_base_segs
1020            && linestring [0] == 'B' && linestring [1] == 'S' && isspace (linestring [2])) {
1021         contig->base_segs[contig->num_base_segs++] = s_ReadBaseSeg (linestring);
1022         free (linestring);
1023         linestring = readfunc (userdata);
1024     }
1025     if (contig->num_base_segs < num_base_segs) {
1026         return eTooFew;
1027     } else if (linestring != NULL && linestring [0] != EOF && ! s_LineIsEmptyButNotEof (linestring)) {
1028         return eTooMany;
1029     } else {
1030         return eJustRight;
1031     }
1032 }
1033 
1034 
/* Returns 1 when ch is 'N' or 'X' (upper case only), 0 otherwise. */
static char s_IsEquivN (char ch)
{
    switch (ch) {
        case 'N':
        case 'X':
            return 1;
        default:
            return 0;
    }
}
1043 
1044 
1045 /* Terminal Ns will always be trimmed in the GenBank records */
1046 static void s_AdjustContigReadForTerminalNs (TContigReadPtr read)
1047 {
1048     char * cp_src;
1049     char * cp_dst;
1050     int    len = 0;
1051 
1052     if (read == NULL || read->read_seq == NULL) return;
1053     cp_src = read->read_seq;
1054     while (s_IsEquivN(*cp_src)) {
1055         len++;
1056         cp_src++;
1057     }
1058     if (len > 0) {
1059         read->cons_start += len;
1060         cp_dst = read->read_seq;
1061         while (*cp_src != 0) {
1062             *cp_dst = *cp_src;
1063             cp_dst++;
1064             cp_src++;
1065         }
1066         *cp_dst = 0;
1067     }
1068     len = strlen (read->read_seq);
1069     cp_src = read->read_seq + len - 1;
1070     while (cp_src >= read->read_seq && s_IsEquivN(*cp_src)) {
1071         *cp_src = 0;
1072         cp_src--;
1073     }
1074 } 
1075 
1076 
1077 /* Terminal Ns will always be trimmed by the GenBank record */
1078 static void s_AdjustContigForTerminalNs (TContigPtr contig)
1079 {
1080     char * cp_src;
1081     char * cp_dst;
1082     int    len = 0, i;
1083 
1084     if (contig == NULL || contig->consensus_seq == NULL) return;
1085     cp_src = contig->consensus_seq;
1086     while (s_IsEquivN(*cp_src)) {
1087         len++;
1088         cp_src++;
1089     }
1090     if (len > 0) {
1091         /* adjust quality scores */
1092         if (contig->qual_scores != NULL) {
1093             contig->num_qual_scores -= len;
1094             for (i = 0; i < contig->num_qual_scores; i++) {
1095                 contig->qual_scores[i] = contig->qual_scores [i + len];
1096             }
1097         }
1098         /* adjust reads */
1099         if (contig->reads != NULL) {
1100             for (i = 0; i < contig->num_reads; i++) {
1101                 if (contig->reads[i] != NULL) {
1102                     contig->reads[i]->cons_start -= len;
1103                 }
1104             }
1105         }
1106         /* adjust consensus sequence */
1107         cp_dst = contig->consensus_seq;
1108         while (*cp_src != 0) {
1109             *cp_dst = *cp_src;
1110             cp_dst++;
1111             cp_src++;
1112         }
1113         *cp_dst = 0;
1114         contig->consensus_assem_len -= len;
1115     }
1116     /* trim 3' Ns */
1117     len = 0;
1118     cp_src = contig->consensus_seq + contig->consensus_assem_len - 1;
1119     while (cp_src >= contig->consensus_seq && s_IsEquivN(*cp_src)) {
1120         *cp_src = 0;
1121         cp_src--;
1122         contig->consensus_assem_len--;
1123         len++;
1124     }
1125     /* truncate quality scores if 3' Ns trimmed */
1126     if (len > 0 && contig->qual_scores != NULL) {
1127         contig->num_qual_scores -= len;
1128     }
1129 } 
1130 
1131 
1132 /* Clips the sequence read in according to the QA clipping.
1133  * The real coordinates will be recovered when an alignment is generated between
1134  * the sequence in the structure and the sequence downloaded from the Trace Archive.
1135  */
1136 static char ApplyQALineToRead (TContigReadPtr read, char *linestring, char *id, char *has_errors)
1137 {
1138     char *cp;
1139     int  values[4];
1140     int  i = 0;
1141 
1142     if (read == NULL || linestring == NULL) {
1143         PrintACEFormatErrorXML ("File end where QA line should be", id, has_errors);
1144         return 0;
1145     }
1146   
1147     cp = linestring;
1148     if (*cp != 'Q') {
1149         PrintACEFormatErrorXMLStart (id, has_errors);
1150         printf ("Expected QA line, found %s", linestring);
1151         PrintACEFormatErrorXMLEnd ();
1152         return 0;
1153     }
1154     cp++;
1155     if (*cp != 'A') {
1156         PrintACEFormatErrorXMLStart (id, has_errors);
1157         printf ("Expected QA line, found %s", linestring);
1158         PrintACEFormatErrorXMLEnd ();
1159         return 0;
1160     }
1161     cp++;
1162     while (*cp != 0 && i < 4) {
1163         while (isspace (*cp)) {
1164             cp++;
1165         }
1166         if (*cp != '-' && !isdigit (*cp)) {
1167           PrintACEFormatErrorXML ("Found non-number on QA line", id, has_errors);
1168           return 0;
1169         }
1170         values[i] = atoi (cp);
1171         i++;
1172         while (*cp == '-' || isdigit (*cp)) {
1173             cp++;
1174         }
1175     }
1176     if (*cp != 0 || i < 4) {
1177         PrintACEFormatErrorXML ("Fewer than four numbers on line", id, has_errors);
1178         return 0;
1179     }
1180     if (values[0] > 0 || values[2] > 0) {
1181         if (values[0] > values[2]) {
1182             read->read_assem_start = values[0];
1183         } else {
1184             read->read_assem_start = values[2];
1185         }
1186     }
1187 
1188     if (values[1] > 0 && values[3] > 0) {
1189         if (values[1] < values[3]) {
1190             read->read_assem_stop = values[1];
1191         } else {
1192             read->read_assem_stop = values[3];
1193         }
1194     } else if (values[1] > 0) {
1195         read->read_assem_stop = values[1];
1196     } else if (values[3] > 0) {
1197         read->read_assem_stop = values[3];
1198     }
1199 
1200     /* adjust first gap position for start */
1201     if (read->read_assem_start > 1 && read->gaps != NULL && read->gaps->num_gaps > 0 && read->gaps->gap_offsets != NULL) {
1202         read->gaps->gap_offsets[0] -= read->read_assem_start - 1;
1203     }
1204         
1205     return 1;
1206 }
1207     
1208 
1209 /* calculate gap info for consensus sequence */
1210 /* calculate cons_stop positions and tiling positions for each read */
1211 static void s_CalculateContigOffsets (TContigPtr contig)
1212 {
1213     int i;
1214 
1215     if (contig == NULL) return;
1216 
1217     for (i = 0; i < contig->num_reads; i++) {
1218         contig->reads[i]->tiling_start = contig->reads[i]->read_assem_start + contig->reads[i]->cons_start;
1219         contig->reads[i]->tiling_stop = contig->reads[i]->read_assem_stop + contig->reads[i]->cons_start;
1220         contig->reads[i]->cons_stop = SeqPosFromTilingPos (contig->reads[i]->tiling_stop - 1, contig->gaps) + 1;
1221         contig->reads[i]->read_start = SeqPosFromTilingPos(contig->reads[i]->read_assem_start - 1, contig->reads[i]->gaps) + 1;
1222         contig->reads[i]->read_stop = SeqPosFromTilingPos(contig->reads[i]->read_assem_stop - 1, contig->reads[i]->gaps) + 1;
1223     }
1224 
1225 }
1226 
1227 
1228 static int s_GetUngappedSeqLen (char *str, char *gap_chars)
1229 {
1230     int len = 0;
1231 
1232     if (str == NULL) return 0;
1233     while (*str != 0) {
1234         if (!s_IsGapChar (*str, gap_chars)) {
1235             len++;
1236         }
1237         str++;
1238     }
1239     return len;
1240 }
1241 
1242 
/* Joins two heap-allocated comment strings with a newline between them.
 *
 * Both arguments are owned by this function after the call: when both are
 * non-NULL they are freed and a newly allocated "orig\nextra" is returned.
 * When one of them is NULL the other is returned unchanged (no copy).
 * If the new string cannot be allocated, extra is dropped and orig is
 * returned so that the existing comment is not lost.
 */
static char * s_AddToTagComment (char *orig, char *extra)
{
    char   *joined;
    size_t  orig_len, extra_len;

    if (orig == NULL) {
        return extra;
    }
    if (extra == NULL) {
        /* nothing to add; previously this dereferenced NULL in strlen */
        return orig;
    }

    orig_len = strlen (orig);
    extra_len = strlen (extra);
    joined = malloc (orig_len + extra_len + 2);   /* newline + terminator */
    if (joined == NULL) {
        free (extra);
        return orig;
    }
    memcpy (joined, orig, orig_len);
    joined[orig_len] = '\n';
    memcpy (joined + orig_len + 1, extra, extra_len + 1);
    free (orig);
    free (extra);
    return joined;
}
1261 
1262 
1263 static char * s_ReadTagComment
1264 (FReadLineFunction    readfunc,
1265  void *               userdata)
1266 {
1267     char *linestring;
1268     char *tag = NULL;
1269     char *cp = NULL;
1270     char *tmp;
1271     int   tag_len = 0, end_len;
1272 
1273     linestring = readfunc (userdata);
1274     while (linestring != NULL  &&  linestring [0] != EOF && (cp = strchr (linestring, '}')) == NULL) {
1275         if (tag == NULL) {
1276             tag_len = strlen (linestring);
1277             tag = malloc (sizeof (char) * (tag_len + 1));
1278             strcpy (tag, linestring);
1279         } else {
1280             tag_len = tag_len + strlen (linestring) + 1;
1281             tmp = malloc (sizeof (char) * (tag_len + 1));
1282             strcpy (tmp, tag);
1283             strcat (tmp, "\n");
1284             strcat (tmp, linestring);
1285             free (tag);
1286             tag = tmp;
1287         }
1288         free (linestring);
1289         linestring = readfunc (userdata);
1290     }
1291     if (cp != NULL && cp > linestring) {
1292         end_len = cp - linestring;
1293         tag_len = tag_len + end_len + 1;
1294         tmp = malloc (sizeof (char) * (tag_len + 1));
1295         strcpy (tmp, tag);
1296         strcat (tmp, "\n");
1297         strncat (tmp, linestring, end_len);
1298         tmp[tag_len] = 0;
1299         free (tag);
1300         tag = tmp;
1301     }
1302     if (linestring != NULL) {
1303         free (linestring);
1304     }
1305 
1306     return tag;
1307 }
1308 
1309 
1310 /* Reads the portion of and ACE file for a single contig, including the reads */
1311 static TContigPtr s_ReadContig
1312 (char **              initline,
1313  FReadLineFunction    readfunc,
1314  void *               userdata,
1315  char                 make_qual_scores,
1316  char *               has_errors)
1317 {
1318     char      *linestring;
1319     char      *firstline;
1320     char      *cp;
1321     int        len = 0, read_num = 0, num_base_segs = 0;
1322     EFound     val;
1323     char       found_comp_char = 0;
1324     TContigPtr contig = NULL;
1325 
1326     if (initline == NULL) return NULL;
1327     firstline = *initline;
1328     if (firstline == NULL || readfunc == NULL) return NULL;
1329 
1330     if (firstline [0] != 'C' || firstline [1] != 'O' || ! isspace (firstline [2])) {
1331         return NULL;
1332     }
1333 
1334     contig = ContigNew ();
1335     len = strlen (firstline + 3);
1336     contig->consensus_id = malloc (len + 1);
1337     strcpy (contig->consensus_id, firstline + 3);
1338  
1339     cp = contig->consensus_id;
1340     while (*cp != 0 && !isspace (*cp)) {
1341         cp++;
1342     }
1343     if (isspace (*cp)) {
1344         *cp = 0;
1345         cp++;
1346         contig->consensus_assem_len = atoi (cp);
1347         while (isdigit (*cp)) {
1348             cp++;
1349         }
1350         if (isspace (*cp)) {
1351             cp++;
1352             contig->num_reads = atoi (cp);
1353             while (isdigit (*cp)) {
1354                 cp++;
1355             } 
1356             if (isspace (*cp)) {
1357                 cp++;
1358                 num_base_segs = atoi (cp);
1359                 while (isdigit (*cp)) {
1360                     cp++;
1361                 }
1362                 if (isspace (*cp)) {
1363                     cp++;
1364                     found_comp_char = 1;
1365                     if (*cp == 'C') {
1366                         contig->is_complement = 1;
1367                     } else {
1368                         contig->is_complement = 0;
1369                     }
1370                 } 
1371             }
1372         }
1373     }
1374     if (contig->consensus_assem_len == 0 || contig->num_reads == 0 || !found_comp_char) {
1375         PrintACEFormatErrorXML ("Error in consensus line", contig->consensus_id, has_errors);
1376         ContigFree (contig);
1377         return NULL;
1378     }
1379         
1380     /* now copy in sequence data */
1381     contig->consensus_seq = s_ReadSequenceFromFile (contig->consensus_assem_len, readfunc, userdata, contig->consensus_id, has_errors);
1382     if (contig->consensus_seq == NULL) {
1383         ContigFree (contig);
1384         return NULL;
1385     }
1386 
1387     /* record actual length of consensus seq */
1388     contig->consensus_seq_len = s_GetUngappedSeqLen (contig->consensus_seq, "*");
1389 
1390     /* calculate gap info */
1391     contig->gaps = GapInfoFromSequenceString (contig->consensus_seq, "*");
1392     
1393     /* read quality scores */
1394     if (make_qual_scores) {
1395         val = s_ReadQualScores (contig, readfunc, userdata);
1396         if (val != eNone && val != eJustRight) {
1397             s_ReportFound (val, "quality scores", contig->consensus_id, has_errors);
1398             ContigFree (contig);
1399             return NULL;
1400         }
1401     } else {
1402         s_SkipQualScores (readfunc, userdata);
1403     }
1404 
1405     /* collect reads */
1406     val = s_ReadAFLines (contig, readfunc, userdata, &linestring);
1407     if (val != eJustRight) {
1408         s_ReportFound (val, "AF lines", contig->consensus_id, has_errors);
1409         ContigFree (contig);
1410         if (linestring != NULL) free (linestring);
1411         return NULL;
1412     }
1413  
1414     if (num_base_segs > 0) {
1415         val = s_ReadBaseSegs (contig, num_base_segs, linestring, readfunc, userdata);
1416         if (val != eJustRight) {
1417             s_ReportFound (val, "base segments", contig->consensus_id, has_errors);
1418             ContigFree (contig);
1419             return NULL;
1420         }
1421     }
1422 
1423     
1424     read_num = 0;
1425     linestring = readfunc (userdata);
1426     while (linestring != NULL  &&  linestring [0] != EOF) {
1427         if (linestring [0] == 'R' && linestring[1] == 'D' && isspace (linestring [2])) {
1428             len = strlen (contig->reads[read_num]->read_id);
1429             if (strncmp (linestring + 3, contig->reads[read_num]->read_id, len) != 0
1430                 || !isspace (linestring [3 + len])) {
1431                 PrintACEFormatErrorXML ("Read IDs out of order!", contig->consensus_id, has_errors);
1432                 ContigFree (contig);
1433                 return NULL;
1434             } 
1435             len = atoi (linestring + 3 + len);
1436             contig->reads[read_num]->read_seq = s_ReadSequenceFromFile (len, readfunc, userdata, contig->reads[read_num]->read_id, has_errors);
1437             if (contig->reads[read_num]->read_seq == NULL) {
1438                 ContigFree (contig);
1439                 return NULL;
1440             }
1441             s_AdjustContigReadForTerminalNs (contig->reads[read_num]);
1442             contig->reads[read_num]->read_len = s_GetUngappedSeqLen (contig->reads[read_num]->read_seq, "*");
1443             contig->reads[read_num]->gaps = GapInfoFromSequenceString (contig->reads[read_num]->read_seq, "*");
1444             read_num++;
1445         } else if (linestring [0] == 'Q' && linestring[1] == 'A' && isspace (linestring[2])) {
1446             if (read_num < 1) {
1447                 PrintACEFormatErrorXML ("Found QA line before RD!", contig->consensus_id, has_errors);
1448                 ContigFree (contig);
1449                 return NULL;
1450             } else if (!ApplyQALineToRead (contig->reads[read_num - 1], linestring, contig->reads[read_num - 1]->read_id, has_errors)) {
1451                 PrintACEFormatErrorXML ("Error in QA line format!", contig->reads[read_num - 1]->read_id, has_errors);
1452                 ContigFree (contig);
1453                 return NULL;
1454             }
1455         } else if (linestring[0] == 'D' && linestring[1] == 'S' && isspace (linestring[2])) {
1456             /* skip DS lines */
1457         } else if (strncmp (linestring, "RT{", 3) == 0) {
1458             contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[read_num - 1]->tag, s_ReadTagComment (readfunc, userdata));
1459         } else if (strncmp (linestring, "WR{", 3) == 0) {
1460             contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[read_num - 1]->tag, s_ReadTagComment (readfunc, userdata));
1461         } else if (strncmp (linestring, "CT{", 3) == 0) {
1462             contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata));
1463         } else if (strncmp (linestring, "WA{", 3) == 0) {
1464             contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata));
1465         } else if (linestring[0] != 0) {
1466             /* found next line */
1467             *initline = linestring;
1468             s_AdjustContigForTerminalNs (contig);
1469             s_CalculateContigOffsets (contig);
1470             return contig;
1471         }
1472         free (linestring);
1473         linestring = readfunc (userdata);
1474     }
1475     *initline = NULL;
1476     s_AdjustContigForTerminalNs (contig);
1477     s_CalculateContigOffsets (contig);
1478     return contig;
1479 }
1480 
1481 
/* Used to detect errors in ACE file formatting.
 * Returns 1 when linestring begins with "AF" or "RD", which are not expected
 * outside of a contig, and 0 otherwise (including for NULL).
 */
static char s_UnexpectedLineBetweenContigs (char *linestring)
{
    if (linestring == NULL) {
        return 0;
    }
    if (strncmp (linestring, "AF", 2) == 0 || strncmp (linestring, "RD", 2) == 0) {
        return 1;
    }
    return 0;
}
1495 
1496 
1497 /* This is the main function for reading in an ACE file */
1498 extern TACEFilePtr
1499 ReadACEFile
1500 (FReadLineFunction    readfunc,
1501  void *               userdata,
1502  char                 make_qual_scores,
1503  char *               has_errors)
1504 {
1505     char *              linestring;
1506     TACEFilePtr         afp;
1507     char *              cp;
1508     int                 contig_num = 0, read_num = 0;
1509     int                 num_reads_expected = 0;
1510 
1511     if (readfunc == NULL) {
1512         return NULL;
1513     }
1514 
1515     afp = ACEFileNew ();
1516     if (afp == NULL) {
1517         return NULL;
1518     }
1519   
1520     linestring = readfunc (userdata);
1521 
1522     while (linestring != NULL  &&  linestring [0] != EOF) {
1523         if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
1524             if (num_reads_expected > 0) {
1525                 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
1526                 ACEFileFree (afp);
1527                 free (linestring);
1528                 return NULL;
1529             }
1530             /* first line in file, number of contigs */
1531             cp = linestring + 3;
1532             afp->num_contigs = atoi (cp);
1533             afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
1534             if (afp->contigs == NULL) {
1535                 PrintACEFormatErrorXML ("Memory allocation failed!", NULL, has_errors);
1536                 free (linestring);
1537                 ACEFileFree (afp);
1538                 return NULL;
1539             }
1540             while (isdigit (*cp)) {
1541                 cp++;
1542             }
1543             num_reads_expected = atoi (cp);
1544             free (linestring);
1545             linestring = readfunc (userdata);
1546         } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
1547             if (contig_num >= afp->num_contigs) {
1548                 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
1549                 free (linestring);
1550                 ACEFileFree (afp);
1551                 return NULL;
1552             }
1553             afp->contigs[contig_num] = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
1554             if (afp->contigs[contig_num] == NULL) {
1555                 PrintACEFormatErrorXMLStart (NULL, has_errors);
1556                 printf ("Unable to read contig (%d)", contig_num);
1557                 PrintACEFormatErrorXMLEnd ();
1558                 ACEFileFree (afp);
1559                 return NULL;
1560             }
1561             read_num += afp->contigs[contig_num]->num_reads;
1562             contig_num++;
1563         } else if (s_UnexpectedLineBetweenContigs (linestring)) {
1564             PrintACEFormatErrorXMLStart (NULL, has_errors);
1565             printf ("Unexpected line after contig %d:%s", read_num, linestring);
1566             PrintACEFormatErrorXMLEnd ();
1567             free (linestring);
1568             ACEFileFree (afp);
1569             return NULL;
1570         } else {
1571             free (linestring);
1572             linestring = readfunc (userdata);
1573         }
1574     }
1575     if (contig_num < afp->num_contigs) {
1576         PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
1577         ACEFileFree (afp);
1578         afp = NULL;
1579     } else if (read_num < num_reads_expected) {
1580         PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
1581         ACEFileFree (afp);
1582         afp = NULL;
1583     }
1584 
1585     return afp;
1586 }
1587 
1588 
/* This function writes out sequence characters, 60 per line.
 * Every line, including a final short one, ends with a newline; nothing is
 * written for an empty string or when fp or seq is NULL.
 */
static void s_WriteSeq (FILE *fp, char *seq)
{
    int column = 0;

    if (fp == NULL || seq == NULL) return;

    for (; *seq != 0; seq++) {
        fputc (*seq, fp);
        column++;
        if (column == 60) {
            fputc ('\n', fp);
            column = 0;
        }
    }
    /* close a partially filled last line */
    if (column > 0) {
        fputc ('\n', fp);
    }
}
1604 
1605 
1606 /* This function writes out quality scores in the ACE file format. */
1607 static void s_WriteQualScores (FILE *fp, TContigPtr contig)
1608 {
1609     int q_pos, line_pos;
1610 
1611     if (fp == NULL || contig == NULL || contig->num_qual_scores == 0) return;
1612 
1613     fprintf (fp, "BQ\n");
1614     q_pos = 0;
1615     while (q_pos < contig->num_qual_scores) {
1616         line_pos = 0;
1617         while (line_pos < 60 && q_pos < contig->num_qual_scores) {
1618             if (contig->consensus_seq[q_pos] != '*') {
1619                 fprintf (fp, "%d ", contig->qual_scores[q_pos]);
1620                 line_pos++;
1621             }
1622             q_pos++;
1623         }
1624         fprintf (fp, "\n");
1625     }
1626 }
1627 
1628 
1629 /* NOTE - this file does not provide all of the information required for an ACE file. */
1630 static void s_WriteContig (FILE *fp, TContigPtr contig)
1631 {
1632     int i;
1633 
1634     if (contig == NULL) return;
1635 
1636     fprintf (fp, "CO %s %d %d\n\n", contig->consensus_id, contig->consensus_assem_len, contig->num_reads);
1637     s_WriteSeq (fp, contig->consensus_seq);
1638     fprintf (fp, "\n");
1639 
1640     s_WriteQualScores (fp, contig);
1641     fprintf (fp, "\n");
1642 
1643     for (i = 0; i < contig->num_reads; i++) {
1644         fprintf (fp, "AF %s %c %d\n", contig->reads[i]->read_id,
1645                                       contig->reads[i]->is_complement ? 'C' : 'U',
1646                                       contig->reads[i]->cons_start + 1);
1647     }
1648     fprintf (fp, "\n");
1649     for (i = 0; i < contig->num_reads; i++) {
1650         fprintf (fp, "RD %s %d\n", contig->reads[i]->read_id, strlen (contig->reads[i]->read_seq));
1651         s_WriteSeq (fp, contig->reads[i]->read_seq);
1652         fprintf (fp, "\n");
1653     }
1654          
1655 }
1656 
1657 
1658 /* NOTE - This generates an incomplete ACE file - the data structure we currently use
1659  * does not provide enough data to create a complete ACE file.
1660  */
1661 extern void WriteACEFile (FILE *fp, TACEFilePtr afp)
1662 {
1663   int i, tot_reads = 0;
1664   if (fp == NULL || afp == NULL) return;
1665 
1666   for (i = 0; i < afp->num_contigs; i++) {
1667     tot_reads += afp->contigs[i]->num_reads;
1668   }
1669   fprintf (fp, "AS %d %d\n\n", afp->num_contigs, tot_reads);
1670 
1671   for (i = 0; i < afp->num_contigs; i++) {
1672     s_WriteContig (fp, afp->contigs[i]);
1673   }   
1674 }
1675 
1676 
/* This function generates a string that uses the FASTA+GAP method for specifying gaps
 * (dashes instead of asterisks)
 *
 * contig_seq  sequence to place in the alignment row ('*' marks a gap);
 *             NULL is treated as an empty sequence
 * cons_start  number of '-' characters written before the sequence
 * aln_len     length of the row; the result is always exactly this long
 * read_start  positions of contig_seq before this 1-based position are
 *             written as '-'; values of 1 or less blank nothing
 * read_stop   copying stops when the 1-based position reaches this value;
 *             values below 1 mean "copy to the end"
 *
 * NOTE(review): the position equal to read_stop is itself not copied;
 * confirm whether the stop coordinate is meant to be inclusive.
 *
 * Returns a newly allocated string, or NULL when aln_len is negative or
 * memory could not be obtained.
 */
static char * 
s_AlignmentSeqFromContigSeq 
(char *contig_seq,
 int   cons_start,
 int   aln_len,
 int   read_start,
 int   read_stop)
{
    char * aln_seq;
    char * cp;
    int    pos = 0;
    /* 1-based position within contig_seq.  This must start at 1 in every
     * case; it used to inherit the padding loop's counter when
     * read_start <= 1, which cut the copy short or suppressed it.
     */
    int    read_pos = 1;

    if (aln_len < 0) return NULL;
    aln_seq = malloc (sizeof (char) * (aln_len + 1));
    if (aln_seq == NULL) return NULL;

    /* every write below is bounded by aln_len so the buffer cannot overrun */

    /* pad start */
    while (pos < cons_start && pos < aln_len) {
        aln_seq[pos] = '-';
        pos++;
    }

    cp = contig_seq;
    if (cp != NULL) {
        /* blank out the part of the sequence before read_start */
        while (*cp != 0 && read_pos < read_start && pos < aln_len) {
            aln_seq[pos] = '-';
            pos++;
            cp++;
            read_pos++;
        }
        /* copy the rest, converting gap characters */
        while (*cp != 0 && (read_stop < 1 || read_pos < read_stop) && pos < aln_len) {
            aln_seq[pos] = (*cp == '*') ? '-' : *cp;
            pos++;
            cp++;
            read_pos++;
        }
    }

    /* pad end */
    while (pos < aln_len) {
        aln_seq[pos] = '-';
        pos++;
    }
    aln_seq[pos] = 0;
    return aln_seq;
}
1725 
1726 
/* Returns a newly allocated copy of read_id for the caller to free, or NULL
 * when read_id is NULL or memory could not be obtained.
 */
static char * s_FarPointerIdFromReadId (char * read_id)
{
    char   *far_id;
    size_t  len;

    if (read_id == NULL) {
        return NULL;
    }
    len = strlen (read_id);
    far_id = malloc (sizeof (char) * (len + 1));
    if (far_id != NULL) {
        memcpy (far_id, read_id, len + 1);   /* + 1 copies the terminator */
    }
    return far_id;
}
1734 
1735 
1736 /* This function generates an intermediate data format suitable for generating
1737  * a SeqEntry with an alignment.
1738  */
1739 extern TAlignmentFilePtr AlignmentFileFromContig (TContigPtr contig)
1740 {
1741     TAlignmentFilePtr aln;
1742     int               i, len;
1743     int               consensus_pad = 0, pad, end_pad = 0, aln_len;
1744 
1745     if (contig == NULL) return NULL;
1746 
1747     aln = AlignmentFileNew ();
1748     aln->num_sequences = 1 + contig->num_reads;
1749     aln->num_organisms = 0;
1750     aln->num_deflines = 0;
1751     aln->num_segments = 1;
1752     aln->ids = malloc (sizeof (char *) * aln->num_sequences);
1753     aln->sequences = malloc (sizeof (char *) * aln->num_sequences);
1754     aln->organisms = NULL;
1755     aln->deflines = NULL;
1756     aln->align_format_found = 1;
1757     /* calculate padding for consensus */
1758     for (i = 0; i < contig->num_reads; i++) {
1759         if (contig->reads[i]->cons_start < 0) {
1760             pad = 0 - contig->reads[i]->cons_start;
1761             if (consensus_pad < pad) {
1762                 consensus_pad = pad;
1763             }
1764         }
1765         len = contig->reads[i]->cons_start + strlen (contig->reads[i]->read_seq);
1766         if (len > contig->consensus_assem_len) {
1767             pad = len - contig->consensus_assem_len;
1768             if (pad > end_pad) {
1769                 end_pad = pad;
1770             }
1771         }
1772     }
1773     aln_len = consensus_pad + contig->consensus_assem_len + end_pad;
1774     /* seq for consensus */
1775     len = strlen (contig->consensus_id);
1776     aln->ids[0] = malloc (sizeof (char) * (len + 1));
1777     strcpy (aln->ids[0], contig->consensus_id);
1778     aln->sequences[0] = s_AlignmentSeqFromContigSeq (contig->consensus_seq,
1779                                                      consensus_pad,
1780                                                      aln_len, 0, 0);
1781     for (i = 1; i < aln->num_sequences; i++) {
1782         len = strlen (contig->reads[i - 1]->read_id);
1783         aln->ids[i] = s_FarPointerIdFromReadId (contig->reads[i - 1]->read_id);
1784         aln->sequences[i] = s_AlignmentSeqFromContigSeq (contig->reads[i - 1]->read_seq,
1785                                                          consensus_pad + contig->reads[i - 1]->cons_start,
1786                                                          aln_len,
1787                                                          contig->reads[i - 1]->read_assem_start,
1788                                                          contig->reads[i - 1]->read_assem_stop);
1789     }
1790     return aln;
1791 }
1792 
1793 
/* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap
 *
 * Each '*' or '-' in seq_str produces one number: the count of non-gap
 * characters since the previous gap (or since the start).  Numbers are
 * separated by commas, with no trailing comma.  A sequence without gaps
 * yields an empty string.
 *
 * Returns a newly allocated string for the caller to free, or NULL when
 * seq_str is NULL or memory could not be obtained.
 */
extern char * TraceArchiveGapStringFromACESequence (char *seq_str)
{ 
    char   *cp;
    char   *gap_str;
    char   *print_pos;
    size_t  num_gaps = 0;
    int     run = 0;

    if (seq_str == NULL) return NULL;

    /* first determine length of gap string */
    for (cp = seq_str; *cp != 0; cp++) {
        if (*cp == '*' || *cp == '-') {
            num_gaps++;
        }
    }
    /* 15 characters are reserved per entry (number plus comma) */
    gap_str = malloc (sizeof (char) * (15 * num_gaps + 1));
    if (gap_str == NULL) return NULL;
    /* start out as a valid empty string so that a sequence without gaps
     * is handled; the buffer used to be left uninitialised in that case
     */
    gap_str[0] = 0;

    print_pos = gap_str;
    for (cp = seq_str; *cp != 0; cp++) {
        if (*cp == '*' || *cp == '-') {
            print_pos += sprintf (print_pos, "%d,", run);
            run = 0;
        } else {
            run++;
        }
    }
    /* trim final comma, if anything was written */
    if (print_pos > gap_str) {
        print_pos[-1] = 0;
    }
    return gap_str;
}
1831 
1832 
1833 /* NOTE - These functions are currently incomplete */
1834 extern void WriteTraceArchiveRead (FILE *fp, TContigReadPtr read)
1835 {
1836     char *cp;
1837     if (fp == NULL || read == NULL) {
1838         return;
1839     }
1840 
1841     fprintf (fp, "<trace>\n");
1842     fprintf (fp, "<trace_name>%s</trace_name>\n", read->read_id);
1843     fprintf (fp, "<traceconsensus>");
1844     cp = read->read_seq;
1845     while (*cp != 0) {
1846         if (*cp != '*') {
1847             fprintf (fp, "%c", *cp);
1848         }
1849         cp++;
1850     }
1851     fprintf (fp, "</traceconsensus>\n"); 
1852     cp = TraceArchiveGapStringFromACESequence (read->read_seq);
1853     fprintf (fp, "<tracegaps>%s</tracegaps>\n", cp);
1854     free (cp);
1855     fprintf (fp, "</trace>\n");
1856 }
1857 
1858 
/* Returns the number of characters at the start of str that precede the
 * first white space character or the end of the string; 0 when str is NULL
 * or begins with white space.
 */
static int s_GetTokenLen (char *str)
{
    int len = 0;

    if (str == NULL) return 0;

    /* <ctype.h> functions need an unsigned char value; a plain char may
     * be negative for bytes above 0x7F
     */
    while (str[len] != 0 && !isspace ((unsigned char) str[len])) {
        len++;
    }
    return len;
}
1873  
1874    
/* Skips leading white space and then num_tokens white-space separated
 * tokens, together with the white space that follows each of them.
 * Returns a pointer to the character reached, or NULL when str is NULL,
 * num_tokens is negative, or the end of the string was reached.
 */
static char * s_SkipTokens (char *str, int num_tokens)
{
    char *cp;
    int   i;

    if (str == NULL || num_tokens < 0) return NULL;

    /* <ctype.h> functions need an unsigned char value throughout */
    cp = str;
    /* skip leading whitespace */
    while (isspace ((unsigned char) *cp)) {
        cp++;
    }

    for (i = 0; i < num_tokens && *cp != 0; i++) {
        /* skip token */
        while (*cp != 0 && !isspace ((unsigned char) *cp)) {
            cp++;
        }
        /* skip trailing whitespace */
        while (isspace ((unsigned char) *cp)) {
            cp++;
        }
    }
    return (*cp == 0) ? NULL : cp;
}
1904     
1905 
1906 /* for reading other formats */
1907 extern TContigReadPtr 
1908 ReadContigFromString 
1909 (char *str,
1910  char **consensus_id,
1911  int    id_col,
1912  int    seq_col, 
1913  int    contig_id_col,
1914  int    strand_col,
1915  int    start_col,
1916  int    interpret_n_col
1917  )
1918 {
1919     TContigReadPtr read = NULL;
1920     char *cp;
1921     int len, col_num = 1, n_is_gap = 0;
1922     int max_col;
1923 
1924     if (str == NULL) {
1925         return NULL;
1926     }
1927 
1928     max_col = id_col;
1929     if (seq_col > max_col) {
1930       max_col = seq_col;
1931     }
1932     if (contig_id_col > max_col) {
1933       max_col = contig_id_col;
1934     }
1935     if (strand_col > max_col) {
1936       max_col = strand_col;
1937     }
1938     if (start_col > max_col) {
1939       max_col = start_col;
1940     }
1941     if (interpret_n_col > max_col) {
1942       max_col = interpret_n_col;
1943     }
1944 
1945     read = ContigReadNew ();
1946 
1947     cp = str;
1948     len = s_GetTokenLen (cp);
1949     while (cp != NULL && *cp != 0 && col_num <= max_col) {
1950         if (id_col == col_num) {
1951             read->read_id = malloc (len + 1);
1952             strncpy (read->read_id, cp, len);
1953             read->read_id[len] = 0;
1954         } else if (seq_col == col_num) {
1955             read->read_seq = malloc (len + 1);
1956             strncpy (read->read_seq, cp, len);
1957             read->read_seq[len] = 0;
1958             read->read_len = len;
1959         } else if (contig_id_col == col_num) {
1960             if (consensus_id != NULL) {
1961                 *consensus_id = malloc (len + 1);
1962                 strncpy (*consensus_id, cp, len);
1963                 (*consensus_id)[len] = 0;
1964             }
1965         } else if (strand_col == col_num) {
1966             if (*cp == 'R' || *cp == '-') {
1967                 read->is_complement = 1;
1968             }
1969         } else if (start_col == col_num) {
1970             read->cons_start = atoi (cp);
1971         } else if (interpret_n_col == col_num) {
1972             if (*cp == 'I') {
1973                 n_is_gap = 1;
1974             }
1975         }
1976         /* advance to next token */
1977         col_num++;
1978         cp = s_SkipTokens (cp, 1);
1979         len = s_GetTokenLen (cp);
1980     }
1981             
1982     if (max_col > col_num) {
1983         ContigReadFree (read);
1984         read = NULL;
1985     } else {
1986         read->cons_stop = read->cons_start + read->read_len - 1;
1987         read->tiling_start = read->cons_start;
1988         read->tiling_stop = read->cons_stop;
1989         read->read_assem_start = 0;
1990         read->read_assem_stop = read->read_len - 1;
1991         read->read_start = 1;
1992         read->read_stop = read->read_len;
1993         if (n_is_gap) {
1994             /* adjust for gaps */
1995             read->gaps = GapInfoFromSequenceString (read->read_seq, "N");
1996             if (read->gaps->num_gaps > 0) {
1997                 RemoveGapCharsFromSequenceString (read->read_seq, "N");
1998                 read->read_stop -= read->gaps->num_gaps;
1999                 read->read_len -= read->gaps->num_gaps;
2000             }
2001         }
2002     }
2003          
2004     return read;
2005 }
2006 
2007 
2008 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromMAQString (char *str, char **consensus_id)
2009 {
2010     TContigReadPtr read = NULL;
2011 
2012     read = ReadContigFromString (str, consensus_id, 1, 15, 2, 4, 3, 0);
2013     return read;
2014 }
2015 
2016 
2017 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandMostCompressed (char *str, char **consensus_id)
2018 {
2019     TContigReadPtr read = NULL;
2020 
2021     read = ReadContigFromString (str, consensus_id, 0, 1, 0, 5, 4, 0);
2022     return read;
2023 }
2024 
2025 
2026 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandSanger (char *str, char **consensus_id)
2027 {
2028     TContigReadPtr read = NULL;
2029 
2030     read = ReadContigFromString (str, consensus_id, 1, 2, 4, 6, 5, 0);
2031     return read;
2032 }
2033 
2034 
2035 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandStandalone (char *str, char **consensus_id)
2036 {
2037     TContigReadPtr read = NULL;
2038 
2039     read = ReadContigFromString (str, consensus_id, 1, 2, 7, 9, 8, 10);
2040     return read;
2041 }
2042   
2043 
2044 #define READ_BLOCK_SIZE 50
2045 
2046 typedef struct ReadList {
2047   TContigReadPtr reads[READ_BLOCK_SIZE];
2048   int         num_reads;
2049   struct ReadList * next;
2050 } SReadList, * TReadListPtr;
2051 
2052 
2053 static TReadListPtr ReadListNew ()
2054 {
2055     TReadListPtr r;
2056 
2057     r = malloc (sizeof (SReadList));
2058     r->num_reads = 0;
2059     r->next = NULL;
2060     return r;
2061 }
2062 
2063 
2064 static TReadListPtr ReadListFree (TReadListPtr r)
2065 {
2066     TReadListPtr r_next;
2067     int          i;
2068 
2069     while (r != NULL) {
2070         for (i = 0; i < r->num_reads; i++) {
2071             ContigReadFree (r->reads[i]);
2072         }
2073         r_next = r;
2074         free (r);
2075         r = r_next;
2076     }
2077     return r;
2078 }
2079 
2080 
2081 static TReadListPtr AddToReadList (TContigReadPtr read, TReadListPtr read_list)
2082 {
2083     if (read_list == NULL) {
2084         read_list = ReadListNew();
2085     } else {
2086         while (read_list->next != NULL && read_list->num_reads == READ_BLOCK_SIZE) {
2087             read_list = read_list->next;
2088         }
2089         if (read_list->num_reads == READ_BLOCK_SIZE) {
2090             read_list->next = ReadListNew();
2091             read_list = read_list->next;
2092         }
2093     }
2094     read_list->reads[read_list->num_reads++] = read;
2095     return read_list;
2096 }
2097 
2098    
2099 typedef struct ConsensusReads {
2100   TContigPtr   contig;
2101   TReadListPtr read_list;
2102   TReadListPtr last_read;
2103   struct ConsensusReads * next;
2104 } SConsensusReads, * TConsensusReadsPtr;
2105 
2106 
2107 static TConsensusReadsPtr ConsensusReadsNew (char *consensus_id)
2108 {
2109     TConsensusReadsPtr c;
2110 
2111     c = malloc (sizeof (SConsensusReads));
2112     c->contig = ContigNew ();
2113     if (consensus_id != NULL) {
2114         c->contig->consensus_id = malloc (strlen (consensus_id) + 1);
2115         strcpy (c->contig->consensus_id, consensus_id);
2116     }
2117 
2118     c->read_list = NULL;
2119     c->last_read = NULL;
2120     c->next = NULL;
2121     return c;
2122 }
2123 
2124 
2125 static TConsensusReadsPtr ConsensusReadsFree (TConsensusReadsPtr c)
2126 {
2127     TConsensusReadsPtr c_next;
2128 
2129     while (c != NULL) {
2130         c_next = c->next;
2131         ContigFree (c->contig);
2132         c->read_list = ReadListFree (c->read_list);
2133         free (c);
2134         c = c_next;
2135     }
2136     return c;
2137 }
2138 
2139 
2140 static void AddReadToConsensusReads (TConsensusReadsPtr c, TContigReadPtr read)
2141 {
2142     if (c != NULL && read != NULL) {
2143         c->last_read = AddToReadList (read, c->read_list);
2144         if (c->read_list == NULL) {
2145             c->read_list = c->last_read;
2146         }
2147     }
2148 }
2149 
2150 #define CONSENSUS_BLOCK_SIZE 50
2151 
2152 typedef struct ConsensusReadsList {
2153   TConsensusReadsPtr contigs[CONSENSUS_BLOCK_SIZE];
2154   int                num_contigs;
2155   struct ConsensusReadsList * next;
2156 } SConsensusReadsList, * TConsensusReadsListPtr;
2157 
2158 
2159 static TConsensusReadsListPtr ConsensusReadsListNew ()
2160 {
2161     TConsensusReadsListPtr c;
2162 
2163     c = malloc (sizeof (SConsensusReadsList));
2164     c->num_contigs = 0;
2165     c->next = NULL;
2166     return c;
2167 }
2168 
2169 
2170 static TConsensusReadsListPtr ConsensusReadsListFree (TConsensusReadsListPtr c)
2171 {
2172     TConsensusReadsListPtr c_next;
2173     int                    i;
2174 
2175     while (c != NULL) {
2176         c_next = c->next;
2177         for (i = 0; i < c->num_contigs; i++) {
2178              c->contigs[i] = ConsensusReadsFree (c->contigs[i]);
2179         }
2180         free (c);
2181         c = c_next;
2182     }
2183     return c;
2184 }
2185 
2186 
2187 static TConsensusReadsPtr FindConsensusIDInConsensusReadsList (TConsensusReadsListPtr c, char *consensus_id)
2188 {
2189     int i;
2190     TConsensusReadsPtr r = NULL;
2191 
2192     if (consensus_id == NULL) {
2193         return NULL;
2194     }
2195     while (c != NULL && r == NULL)  {
2196         for (i = 0; i < c->num_contigs && r == NULL; i++) {
2197             if (c->contigs[i] != NULL
2198                 && c->contigs[i]->contig != NULL
2199                 && strcmp (c->contigs[i]->contig->consensus_id, consensus_id) == 0) {
2200                 r = c->contigs[i];
2201             }
2202         }
2203         c = c->next;
2204     }
2205     return r;   
2206 }
2207 
2208 
2209 static TConsensusReadsListPtr 
2210 AddConsensusReadToConsensusReadsList 
2211 (TConsensusReadsListPtr c,
2212  char                 * consensus_id,
2213  TContigReadPtr         read)
2214 {
2215     TConsensusReadsPtr r = NULL;
2216 
2217     if (c == NULL) {
2218         c = ConsensusReadsListNew ();
2219         r = ConsensusReadsNew(consensus_id);
2220         c->contigs[c->num_contigs++] = r;
2221     } else {
2222         r = FindConsensusIDInConsensusReadsList (c, consensus_id);
2223         if (r == NULL) {
2224             while (c->next != NULL && c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2225                 c = c->next;
2226             }
2227             if (c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2228                 c->next = ConsensusReadsListNew ();
2229                 c = c->next;
2230             }
2231             r = ConsensusReadsNew(consensus_id);
2232             c->contigs[c->num_contigs++] = r;
2233         }
2234     }
2235     AddReadToConsensusReads (r, read);
2236 
2237     return c;
2238 }
2239 
2240 
2241 static void MoveReadsToContigFromReadList (TContigPtr contig, TReadListPtr read_list)
2242 {
2243     TReadListPtr r;
2244     int          n = 0, i;
2245 
2246     if (contig == NULL) {
2247         return;
2248     }
2249 
2250     for (r = read_list; r != NULL; r = r->next) {
2251         n += r->num_reads;
2252     }
2253 
2254     contig->num_reads = n;
2255     contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
2256     n = 0;
2257 
2258     for (r = read_list; r != NULL; r = r->next) {
2259         for (i = 0; i < r->num_reads; i++) {
2260             contig->reads[n++] = r->reads[i];
2261             r->reads[i] = NULL;
2262         }
2263         r->num_reads = 0;
2264     }
2265 }
2266    
2267 
2268 static TACEFilePtr ACEFileFromConsensusReadsList (TConsensusReadsListPtr contig_list)
2269 {
2270     TACEFilePtr afp = NULL;
2271     TConsensusReadsListPtr c;
2272     int                    i, n = 0;
2273     
2274     if (contig_list == NULL || contig_list->num_contigs == 0) {
2275         return NULL;
2276     }
2277 
2278     afp = ACEFileNew ();
2279     for (c = contig_list; c != NULL; c=c->next) {
2280         afp->num_contigs += c->num_contigs;
2281     }
2282 
2283     afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
2284     for (c = contig_list; c != NULL; c = c->next) {
2285         for (i = 0; i < c->num_contigs; i++) {
2286             MoveReadsToContigFromReadList (c->contigs[i]->contig, c->contigs[i]->read_list);
2287             afp->contigs[n++] = c->contigs[i]->contig;
2288             c->contigs[i]->contig = NULL;
2289             c->contigs[i]->last_read = NULL;
2290         }
2291         c->num_contigs = 0;
2292     }
2293     return afp;
2294 }
2295 
2296 
2297 extern TACEFilePtr ReadAssemblyFile 
2298 (FReadLineFunction    readfunc,      /* function for reading lines of 
2299                                        * alignment file
2300                                        */
2301  void *               fileuserdata,  /* data to be passed back each time
2302                                        * readfunc is invoked
2303                                        */
2304  FReadFromStringFunction makeread_func) /* function to transform a string into a read */
2305 {
2306     TACEFilePtr afp = NULL;
2307     TContigReadPtr read;
2308     TConsensusReadsListPtr contig_list = NULL, contig_last = NULL;
2309     char *linestring;
2310     char *consensus_id = NULL;
2311 
2312     if (readfunc == NULL || makeread_func == NULL) {
2313         return NULL;
2314     }
2315     linestring = readfunc (fileuserdata);
2316 
2317     while (linestring != NULL  &&  linestring [0] != EOF) {
2318         /* get ContigRead */
2319         read = makeread_func (linestring, &consensus_id);
2320         
2321         /* group with other ContigReads from the same consensus_id */
2322         if (read != NULL && consensus_id != NULL) {
2323             contig_last = AddConsensusReadToConsensusReadsList (contig_last, consensus_id, read);
2324             if (contig_list == NULL) {
2325                 contig_list = contig_last;
2326             }
2327             read = NULL;
2328         } 
2329         if (consensus_id != NULL) {
2330             free (consensus_id);
2331             consensus_id = NULL;
2332         }
2333         ContigReadFree (read);
2334         free (linestring);
2335         linestring = readfunc (fileuserdata);
2336     }
2337 
2338     afp = ACEFileFromConsensusReadsList (contig_list);
2339     contig_list = ConsensusReadsListFree (contig_list);
2340     return afp;
2341 }
2342 
2343 
2344 extern TACEFilePtr ReadMAQFile 
2345 (FReadLineFunction    readfunc,      /* function for reading lines of 
2346                                        * alignment file
2347                                        */
2348  void *               fileuserdata)  /* data to be passed back each time
2349                                        * readfunc is invoked
2350                                        */
2351 {
2352     return ReadAssemblyFile (readfunc, fileuserdata, ReadFromMAQString);
2353 }
2354 
2355 
2356 extern TACEFilePtr ReadElandStandaloneFile 
2357 (FReadLineFunction    readfunc,      /* function for reading lines of 
2358                                        * alignment file
2359                                        */
2360  void *               fileuserdata)  /* data to be passed back each time
2361                                        * readfunc is invoked
2362                                        */
2363 {
2364     return ReadAssemblyFile (readfunc, fileuserdata, ReadFromElandStandalone);
2365 }
2366 
2367 
2368 /* functions for writing out XML */
2369 static void WriteTraceGapsXML (TGapInfoPtr gap_info, FILE *fp)
2370 {
2371   int i;
2372 
2373   if (gap_info != NULL && fp != NULL) {
2374     fprintf (fp, "    <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps);
2375     if (gap_info->num_gaps > 0) {
2376       fprintf (fp, "    <tracegaps source=\"INLINE\">");
2377       for (i = 0; i < gap_info->num_gaps - 1; i++) {
2378         fprintf (fp, "%d ", gap_info->gap_offsets[i]);
2379       }
2380       fprintf (fp, "%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]);
2381     } else {
2382       fprintf (fp, "    <tracegaps source=\"INLINE\"> </tracegaps>\n");
2383     }
2384   }
2385 }
2386 
2387 
2388 static void WriteTraceReadXML (TContigReadPtr read, FILE *fp)
2389 {
2390   if (read != NULL && fp != NULL) {
2391     fprintf (fp, "<trace>\n");
2392     if (read->ti > 0) {
2393       fprintf (fp, "  <ti>%d</ti>\n", read->ti);
2394     }
2395     if (read->srr != NULL) {
2396       fprintf (fp, "  <srr>%s</srr>\n", read->srr);
2397     }
2398     if (read->read_id != NULL) {
2399       fprintf (fp, "  <trace_name>%s</trace_name>\n", read->read_id);
2400     }
2401     fprintf (fp, "  <nbasecalls>%d</nbasecalls>\n", read->read_len);
2402     fprintf (fp, "  <valid>\n");
2403     fprintf (fp, "    <start>%d</start>\n", read->read_start);
2404     fprintf (fp, "    <stop>%d</stop>\n", read->read_stop);
2405     fprintf (fp, "  </valid>\n");
2406     fprintf (fp, "  <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD");
2407     fprintf (fp, "    <start>%d</start>\n", read->tiling_start);
2408     fprintf (fp, "    <stop>%d</stop>\n", read->tiling_stop);
2409     fprintf (fp, "  </tiling>\n");
2410     fprintf (fp, "  <traceconsensus>\n");
2411     fprintf (fp, "    <start>%d</start>\n", read->cons_start);
2412     fprintf (fp, "    <stop>%d</stop>\n", read->cons_stop);
2413     fprintf (fp, "  </traceconsensus>\n");
2414     WriteTraceGapsXML (read->gaps, fp);
2415     fprintf (fp, "</trace>\n");
2416   }
2417 }
2418 
2419 
2420 extern void WriteTraceAssemblyFromContig (TContigPtr contig, FILE *fp)
2421 {
2422   int i;
2423 
2424   if (contig == NULL || fp == NULL) return;
2425 
2426   /* NOTE - need to add new field to TContigPtr for submitter reference, where orig ID should move to */
2427   fprintf (fp, "  <contig submitter_reference=\"%s\" conformation=\"LINEAR\" type=\"NEW\">\n", 
2428            contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2429 
2430   fprintf (fp, "    <ntraces>%d</ntraces>\n", contig->num_reads);
2431 
2432   fprintf (fp, "    <nconbases>%d</nconbases>\n", contig->consensus_seq_len);
2433 
2434   /* need nbasecalls */
2435 
2436   if (contig->gaps == NULL) {
2437     fprintf (fp, "    <ncongaps>0</ncongaps>\n");
2438   } else {
2439     fprintf (fp, "    <ncongaps>%d</ncongaps>\n", contig->gaps->num_gaps);
2440     if (contig->gaps->num_gaps > 0) {
2441       fprintf (fp, "  <congaps source=\"INLINE\">");
2442       for (i = 0; i < contig->gaps->num_gaps - 1; i++) {
2443         fprintf (fp, "%d ", contig->gaps->gap_offsets[i]);
2444       }
2445       fprintf (fp, "%d</congaps>\n", contig->gaps->gap_offsets[contig->gaps->num_gaps - 1]);
2446     }
2447   }
2448   fprintf (fp, "    <consensus>%s</consensus>\n", 
2449            contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2450   if (contig->num_qual_scores > 0) {
2451     fprintf (fp, "    <conqualities source=\"INLINE\">");
2452     for (i = 0; i < contig->num_qual_scores; i++) {
2453       fprintf (fp, "%d ", contig->qual_scores[i]);
2454     }
2455     fprintf (fp, "</conqualities>\n");
2456   }
2457   
2458   for (i = 0; i < contig->num_reads; i++) {
2459     WriteTraceReadXML (contig->reads[i], fp);
2460   }
2461   fprintf (fp, "  </contig>\n");
2462 }
2463 
2464 
/* Writes the XML declaration and the opening part of the <assembly>
 * element, up to and including <coverage>.  The element is left open.
 * NULL strings are replaced by defaults: "Not supplied" for subref,
 * center_name and description, "NEW" for assembly_type and
 * "transcript assembly" for assembly.
 * Coverage is num_readbases / num_conbases, or 0 when num_conbases is 0.
 * Nothing is written when fp is NULL.
 */
extern void
WriteTraceAssemblyHeader
(char * assembly_type,
 char * subref,
 char * center_name,
 int    taxid,
 char * description,
 char * assembly,
 int    num_contigs,
 unsigned int    num_conbases,
 int    num_reads,
 unsigned int    num_readbases,
 FILE * fp)
{
    float coverage = 0;

    if (fp == NULL) {
        return;
    }

    /* substitute the defaults for everything the caller left out */
    if (subref == NULL) {
        subref = "Not supplied";
    }
    if (assembly_type == NULL) {
        assembly_type = "NEW";
    }
    if (center_name == NULL) {
        center_name = "Not supplied";
    }
    if (description == NULL) {
        description = "Not supplied";
    }
    if (assembly == NULL) {
        assembly = "transcript assembly";
    }
    if (num_conbases != 0) {
        coverage = (float) ((float) num_readbases / (float) num_conbases);
    }

    fprintf (fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

    fprintf (fp, "<assembly submitter_reference=\"%s\" type = \"%s\">\n", subref, assembly_type);
    fprintf (fp, "  <center_name>%s</center_name>\n", center_name);
    fprintf (fp, "  <organism descriptor=\"TAXID\">%d</organism>\n", taxid);
    fprintf (fp, "  <description>%s</description>\n", description);
    fprintf (fp, "  <structure>%s</structure>\n", assembly);
    fprintf (fp, "  <ncontigs>%d</ncontigs>\n", num_contigs);
    fprintf (fp, "  <nconbases>%u</nconbases>\n", num_conbases);
    fprintf (fp, "  <ntraces>%d</ntraces>\n", num_reads);
    fprintf (fp, "  <nbasecalls>%u</nbasecalls>\n", num_readbases);
    fprintf (fp, "  <coverage>%f</coverage>\n", coverage);
}
2498  
2499 
/* Writes the closing </assembly> tag that matches the element opened by
 * WriteTraceAssemblyHeader.  Nothing is written when fp is NULL.
 */
extern void WriteTraceAssemblyTrailer (FILE *fp)
{
    if (fp != NULL) {
        fprintf (fp, "</assembly>\n");
    }
}
2507 
2508 
2509 extern void 
2510 WriteTraceAssemblyFromAceFile 
2511 (TACEFilePtr afp,
2512  char *      subref,
2513  char *      center_name, 
2514  int         taxid,
2515  char *      description,
2516  FILE        *fp)
2517 { 
2518   int i, j, traces = 0;
2519   unsigned int conbases = 0, basecalls = 0;
2520 
2521   if (afp == NULL || fp == NULL) return;
2522 
2523 
2524   for (i = 0; i < afp->num_contigs; i++) {
2525     conbases += afp->contigs[i]->consensus_seq_len;
2526     traces += afp->contigs[i]->num_reads;
2527     for (j = 0; j < afp->contigs[i]->num_reads; j++) {
2528       basecalls += afp->contigs[i]->reads[j]->read_len;
2529     }
2530   }
2531   WriteTraceAssemblyHeader (NULL, subref, center_name, taxid, description, NULL, afp->num_contigs, conbases, traces, basecalls, fp);
2532 
2533   for (i = 0; i < afp->num_contigs; i++) {
2534     WriteTraceAssemblyFromContig (afp->contigs[i], fp);
2535   }
2536   WriteTraceAssemblyTrailer (fp);
2537 }
2538 
2539 
2540 extern void WriteFASTAFromContig
2541 (TContigPtr contig,
2542  FILE       *fp)
2543 {
2544     int   k;
2545     char *cp;
2546 
2547     if (contig == NULL || fp == NULL) return;
2548     
2549     fprintf (fp, ">%s\n", contig->consensus_id);
2550     cp = contig->consensus_seq;
2551     while (*cp != 0) {
2552         k = 0;
2553         while (k < 40 && *cp != 0) {
2554             if (*cp != '*') {
2555                 fprintf (fp, "%c", *cp);
2556                 k++;
2557             }
2558             cp++;
2559         }
2560         fprintf (fp, "\n");
2561     }
2562     fprintf (fp, "\n");
2563 }
2564 
2565 
2566 extern void
2567 WriteFASTAFromAceFile
2568 (TACEFilePtr afp,
2569  FILE        *fp)
2570 {
2571   int i;
2572 
2573   if (afp == NULL || fp == NULL) return;
2574   
2575   for (i = 0; i < afp->num_contigs; i++) {
2576     WriteFASTAFromContig (afp->contigs[i], fp);
2577   }
2578 }
2579 
2580 
/* Number of sequence characters held by one FASTA buffer block. */
#define kFASTASeqBufSize 100

/* One block of a singly linked list holding FASTA sequence characters.
 * Only the first num_used characters of buf[] are in use; the buffer is
 * not NUL-terminated.
 */
struct fastaseqbuf {
  char                 buf[kFASTASeqBufSize];
  int                  num_used;
  struct fastaseqbuf * next;
};

typedef struct fastaseqbuf   SFASTASeqBuf;
typedef struct fastaseqbuf * TFASTASeqBufPtr;
2588 
2589 
2590 static TFASTASeqBufPtr s_FASTASeqBufNew ()
2591 {
2592     TFASTASeqBufPtr s;
2593 
2594     s = (TFASTASeqBufPtr) malloc (sizeof (SFASTASeqBuf));
2595     if (s != NULL) {
2596         s->num_used = 0;
2597         s->next = NULL;
2598     }
2599     return s;
2600 }
2601 
2602 
2603 static void s_FASTASeqBufFree (TFASTASeqBufPtr s)
2604 {
2605     TFASTASeqBufPtr s_next;
2606 
2607     while (s != NULL) {
2608         s_next = s->next;
2609         free (s);
2610         s = s_next;
2611     }
2612 }
2613 
2614 
2615 static TFASTASeqBufPtr s_AddFASTAToBuf (char *line, TFASTASeqBufPtr buf)
2616 {
2617     TFASTASeqBufPtr last_buf;
2618     char *cp;
2619 
2620     if (buf == NULL) {
2621         buf = s_FASTASeqBufNew();
2622         last_buf = buf;
2623     } else {
2624         last_buf = buf;
2625         while (last_buf->next != NULL) {
2626             last_buf = last_buf->next;
2627         }
2628     }
2629 
2630     cp = line;
2631     while (*cp != 0 && *cp != '\r' && *cp != '\n') {
2632         while (isspace (*cp)) {
2633             cp++;
2634         }
2635         if (*cp != 0) {
2636             if (!isalpha (*cp)) {
2637                 printf ("Found bad character in FASTA file!\n");
2638                 s_FASTASeqBufFree (buf);
2639                 buf = NULL;
2640                 return buf;
2641             }
2642             if (last_buf->num_used == kFASTASeqBufSize) {
2643                 last_buf->next = s_FASTASeqBufNew ();
2644                 last_buf = last_buf->next;
2645             }
2646             last_buf->buf[last_buf->num_used++] = *cp;
2647             cp++;
2648         }
2649     }
2650     return buf;
2651 }
2652 
2653 
/* Returns a newly allocated copy of str with every '*' removed.
 * The caller owns the result and must free it.
 * Returns NULL when str is NULL or the copy cannot be allocated.
 */
static char * s_StripStars (char *str)
{
    char *cp_src;
    char *cp_dst;
    char *stripped;

    if (str == NULL) {
        return NULL;
    }
    /* the result is never longer than the input */
    stripped = (char *) malloc (sizeof (char) * (strlen (str) + 1));
    if (stripped == NULL) {
        return NULL;
    }
    cp_dst = stripped;
    for (cp_src = str; *cp_src != 0; cp_src++) {
        if (*cp_src != '*') {
            *cp_dst = *cp_src;
            cp_dst++;
        }
    }
    *cp_dst = 0;
    return stripped;
}
2676 
2677 
2678 static int s_DoesFASTAMatchSeq (TFASTASeqBufPtr buf, char *trimmed_seq)
2679 {
2680     int does_match = 1, match_len, seq_len;
2681 
2682     if (buf == NULL || trimmed_seq == NULL || *trimmed_seq == 0) {
2683         return 1;
2684     }
2685 
2686     seq_len = strlen (trimmed_seq);
2687     while (buf != NULL && seq_len > 0 && does_match) {
2688         if (seq_len < buf->num_used) {
2689             match_len = seq_len;
2690         } else {
2691             match_len = buf->num_used;
2692         }
2693         if (strncmp (buf->buf, trimmed_seq, match_len) == 0) {
2694             buf = buf->next;
2695             trimmed_seq += match_len;
2696             seq_len -= match_len;
2697         } else {
2698             does_match = 0;
2699         }
2700     }
2701     return does_match;
2702 }
2703 
2704 
/* Returns the complement of an upper-case nucleotide letter
 * (A<->T, G<->C).  Every other character, including lower-case letters,
 * is returned unchanged.
 */
static char s_CompLetter (char ch)
{
    if (ch == 'A') {
        return 'T';
    }
    if (ch == 'T') {
        return 'A';
    }
    if (ch == 'G') {
        return 'C';
    }
    if (ch == 'C') {
        return 'G';
    }
    return ch;
}
2723 
2724 
/* Reverse-complements seq in place: the character order is reversed and
 * each character is replaced by s_CompLetter of it.
 * A NULL or empty string is left untouched.
 */
static void s_RevCompSequence (char *seq)
{
    char *left, *right;
    char  held;

    if (seq == NULL || *seq == 0) {
        return;
    }

    left = seq;
    right = seq + strlen (seq) - 1;

    /* swap the complemented ends, working towards the middle */
    while (left < right) {
        held = *left;
        *left = s_CompLetter (*right);
        *right = s_CompLetter (held);
        left++;
        right--;
    }
    /* an odd-length sequence leaves one middle character to complement */
    if (left == right) {
        *left = s_CompLetter (*left);
    }
}
2744 
2745 
2746 static int s_GetSequenceOffset (TFASTASeqBufPtr buf, char *trimmed_seq, char is_complement)
2747 {
2748     int offset = 0, buf_offset;
2749     int match_found = 0;
2750     int match_len, seq_len;
2751 
2752     if (buf == NULL || trimmed_seq == NULL) {
2753         return 0;
2754     }
2755 
2756     trimmed_seq = s_StripStars (trimmed_seq);
2757 
2758     if (is_complement) {
2759         s_RevCompSequence (trimmed_seq);
2760     }
2761 
2762     seq_len = strlen (trimmed_seq);
2763 
2764     while (buf != NULL && !match_found) {
2765         buf_offset = 0;
2766         while (buf_offset < buf->num_used && !match_found) {
2767             if (buf->num_used - buf_offset < seq_len) {
2768                 match_len = buf->num_used - buf_offset;
2769             } else {
2770                 match_len = seq_len;
2771             }
2772             if (match_len < seq_len && buf->next == NULL) {
2773                 /* ran out of sequence, no match */
2774                 buf_offset = buf->num_used;
2775             } else if (strncmp (buf->buf + buf_offset, trimmed_seq, match_len) == 0
2776                 && s_DoesFASTAMatchSeq (buf->next, trimmed_seq + match_len)) {
2777                 match_found = 1;
2778             } else {
2779                 buf_offset++;
2780                 offset++;
2781             }
2782         }
2783         if (!match_found) {
2784             buf = buf->next;
2785         }
2786     }
2787     free (trimmed_seq);
2788     if (match_found) {
2789         return offset;
2790     } else {
2791         return -1;
2792     }
2793 }
2794 
2795 
2796 #define kSeqListBufSize 100
2797 
2798 typedef struct seqlistbuf {
2799   TFASTASeqBufPtr buf[kSeqListBufSize];
2800   char * id_list[kSeqListBufSize];
2801   int  num_used;
2802   struct seqlistbuf *next;
2803 } SSeqListBuf, * TSeqListBufPtr;
2804 
2805 
2806 static TSeqListBufPtr s_SeqListBufNew ()
2807 {
2808     TSeqListBufPtr s;
2809 
2810     s = (TSeqListBufPtr) malloc (sizeof (SSeqListBuf));
2811     if (s != NULL) {
2812         s->num_used = 0;
2813         s->next = NULL;
2814     }
2815     return s;
2816 }
2817 
2818 
2819 static void s_SeqListBufFree (TSeqListBufPtr s)
2820 {
2821     TSeqListBufPtr s_next;
2822     int i;
2823 
2824     while (s != NULL) {
2825         s_next = s->next;
2826         for (i = 0; i < s->num_used; i++) {
2827             s_FASTASeqBufFree (s->buf[i]);
2828             free (s->id_list[i]);
2829             s->buf[i] = NULL;
2830         }
2831         free (s);
2832         s = s_next;
2833     }
2834 }
2835 
2836 
/* Extracts the sequence id from a FASTA defline.
 * Leading spaces, tabs and '>' characters are skipped; the id is the run
 * of characters up to the next space, tab, carriage return or newline.
 * Returns a newly allocated copy owned by the caller, or NULL when str is
 * NULL, the id is empty, or the allocation fails.
 */
static char * s_GetFASTAIdFromString (char * str)
{
    char * start;
    char * id = NULL;
    int    id_len;

    if (str != NULL) {
        start = str + strspn (str, " >\t");
        id_len = strcspn (start, " \t\r\n");
        if (id_len != 0) {
            id = (char *)malloc (id_len + 1);
            if (id != NULL) {
                strncpy (id, start, id_len);
                id [ id_len ] = 0;
            }
        }
    }
    return id;
}
2861 
2862 
2863 static TSeqListBufPtr s_AddToSeqList (char *line, TSeqListBufPtr buf)
2864 {
2865     TSeqListBufPtr last_buf;
2866 
2867     if (line == NULL) {
2868         return buf;
2869     }
2870     if (buf == NULL) {
2871         buf = s_SeqListBufNew();
2872         last_buf = buf;
2873     } else {
2874         last_buf = buf;
2875         while (last_buf->next != NULL) {
2876             last_buf = last_buf->next;
2877         }
2878     }
2879 
2880     if (*line == '>') {
2881         if (last_buf->num_used == kSeqListBufSize) {
2882             last_buf->next = s_SeqListBufNew();
2883             last_buf = last_buf->next;
2884         }
2885         last_buf->buf[last_buf->num_used] = s_FASTASeqBufNew ();
2886         last_buf->id_list[last_buf->num_used] = s_GetFASTAIdFromString (line);
2887         last_buf->num_used++;
2888     } else if (last_buf->num_used > 0) {
2889         last_buf->buf[last_buf->num_used - 1] = s_AddFASTAToBuf (line, last_buf->buf[last_buf->num_used - 1]);
2890     }
2891         
2892     return buf;
2893 }
2894 
2895 
2896 static TFASTASeqBufPtr s_GetFastaSeq (TSeqListBufPtr buf, char *id)
2897 {
2898     TFASTASeqBufPtr seq = NULL;
2899     char *cp;
2900     int i, match_len;
2901 
2902     if (id == NULL) {
2903         return NULL;
2904     }
2905 
2906     cp = strchr (id, '.');
2907     if (cp == NULL) {
2908         match_len = strlen (id);
2909     } else {
2910         match_len = cp - id;
2911     }
2912 
2913     while (buf != NULL && seq == NULL) {
2914         for (i = 0; i < buf->num_used && seq == NULL; i++) {
2915             if (strncmp (id, buf->id_list[i], match_len) == 0) {
2916                 seq = buf->buf[i];
2917             }
2918         }
2919         buf = buf->next;
2920     }
2921     return seq;
2922 }
2923 
2924 
2925 static TSeqListBufPtr s_ReadFastaFile (FReadLineFunction readfunc, void * userdata)
2926 {
2927     char *linestring;
2928     TSeqListBufPtr fasta = NULL;
2929 
2930     if (readfunc == NULL) {
2931         return NULL;
2932     }
2933     linestring = readfunc (userdata);
2934     while (!s_IsEOF (linestring)) {
2935         fasta = s_AddToSeqList (linestring, fasta);
2936         free (linestring);
2937         linestring = readfunc (userdata);
2938     }
2939     return fasta;
2940 }
2941 
2942 
2943 
2944 
/* Number of quality scores held by one block of a score list. */
#define kQualScoreBufSize 100

/* One block of a singly linked list of quality scores.
 * Only the first num_used entries of scores[] are in use.
 */
struct qualscorelist {
  int                    scores[kQualScoreBufSize];
  int                    num_used;
  struct qualscorelist * next;
};

typedef struct qualscorelist   SQualScoreList;
typedef struct qualscorelist * TQualScoreListPtr;
2952 
2953 
2954 static TQualScoreListPtr s_QualScoreNew ()
2955 {
2956     TQualScoreListPtr s;
2957 
2958     s =(TQualScoreListPtr) malloc (sizeof (SQualScoreList));
2959     if (s != NULL) {
2960         s->num_used = 0;
2961         s->next = NULL;
2962     }
2963     return s;
2964 }
2965 
2966 
2967 static void s_QualScoreFree (TQualScoreListPtr s)
2968 {
2969     TQualScoreListPtr s_next;
2970 
2971     while (s != NULL) {
2972         s_next = s->next;
2973         free (s);
2974         s = s_next;
2975     }
2976 }
2977 
2978 
2979 static TQualScoreListPtr s_AddQualScores (char *line, TQualScoreListPtr scores)
2980 {
2981     TQualScoreListPtr last_score;
2982     char *cp;
2983 
2984     if (scores == NULL) {
2985         scores = s_QualScoreNew();
2986         last_score = scores;
2987     } else {
2988         last_score = scores;
2989         while (last_score->next != NULL) {
2990             last_score = last_score->next;
2991         }
2992     }
2993 
2994     cp = line;
2995     while (*cp != 0 && *cp != '\r' && *cp != '\n') {
2996         while (isspace (*cp)) {
2997             cp++;
2998         }
2999         if (*cp != 0) {
3000             if (!isdigit (*cp)) {
3001                 printf ("Found bad character in quality scores file!\n");
3002                 s_QualScoreFree (scores);
3003                 scores = NULL;
3004                 return scores;
3005             }
3006             if (last_score->num_used == kQualScoreBufSize) {
3007                 last_score->next = s_QualScoreNew ();
3008                 last_score = last_score->next;
3009             }
3010             last_score->scores[last_score->num_used++] = atoi (cp);
3011             while (isdigit (*cp)) {
3012                 cp++;
3013             }
3014         }
3015     }
3016     return scores;
3017 }
3018 
3019 
3020 static int s_AddScoresToRead (TContigReadPtr read, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3021 {
3022     int score_pos = 0;
3023     int offset = 0;
3024     int skip, score_len;
3025     char *cp;
3026     int *dst;
3027     TFASTASeqBufPtr fasta_seq;
3028 
3029     if (read == NULL || scores == NULL) {
3030         return 0;
3031     }
3032 
3033     if (fasta == NULL) {
3034         skip = read->read_start - 1;
3035     } else {
3036         fasta_seq = s_GetFastaSeq (fasta, read->read_id);
3037         if (fasta_seq == NULL) {
3038             printf ("Unable to locate fasta for %s\n", read->read_id);
3039             return 0;
3040         }
3041 
3042         skip = s_GetSequenceOffset (fasta_seq, read->read_seq, read->is_complement);
3043         if (skip < 0) {
3044             printf ("ACE read did not match FASTA read for %s\n", read->read_id);
3045             return 0;
3046         }
3047     }
3048 
3049     /* skip over scores before part used in assembly */
3050     while (scores != NULL && score_pos < skip) {
3051         if (skip - score_pos < scores->num_used) {
3052             offset = skip - score_pos;
3053             score_pos = skip;
3054         } else if (scores->next == NULL) {
3055             printf ("Not enough scores read for %s\n", read->read_id);
3056             return 0;
3057         } else {
3058             score_pos += kQualScoreBufSize;
3059             scores = scores->next;
3060         }
3061     }
3062 
3063     score_len = strlen (read->read_seq);
3064     read->qual_scores = malloc (sizeof (int) * score_len);
3065 
3066     if (read->is_complement) {
3067         /* need to read scores in reverse direction */
3068         cp = read->read_seq + score_len - 1;
3069         dst = read->qual_scores + score_len - 1;
3070         while (scores != NULL && read->num_qual_scores < score_len) {
3071             if (*cp == '*') {
3072                 *dst = 0;
3073             } else {
3074                 *dst = scores->scores[offset];
3075                 offset++;
3076             }
3077             cp--;
3078             dst--;
3079             read->num_qual_scores++;
3080 
3081             if (offset == kQualScoreBufSize) {
3082                 scores = scores->next;
3083                 offset = 0;
3084             }
3085         }
3086     } else {
3087         cp = read->read_seq;
3088         dst = read->qual_scores;
3089         while (scores != NULL && read->num_qual_scores < score_len) {
3090             if (*cp == '*') {
3091                 *dst = 0;
3092             } else {
3093                 *dst = scores->scores[offset];
3094                 offset++;
3095             }
3096             cp++;
3097             dst++;
3098             read->num_qual_scores++;
3099 
3100             if (offset == kQualScoreBufSize) {
3101                 scores = scores->next;
3102                 offset = 0;
3103             }
3104         }
3105     }
3106     if (read->num_qual_scores == score_len) {
3107         return 1;
3108     } else {
3109         printf ("Not enough qual scores for %s\n", read->read_id);
3110         return 0;
3111     }
3112 }
3113 
3114 
3115 static int s_AddQualScoresToReadsInAceFile (TACEFilePtr afp, char *id, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3116 {
3117     int i, j, found = 0, match_len, rval = 1;
3118     char *cp;
3119 
3120     if (afp == NULL || id == NULL) {
3121         return 0;
3122     }
3123 
3124     cp = strchr (id, '.');
3125     if (cp == NULL) {
3126         match_len = strlen (id);
3127     } else {
3128         match_len = cp - id;
3129     }
3130 
3131     for (i = 0; i < afp->num_contigs; i++) {
3132         for (j = 0; j < afp->contigs[i]->num_reads; j++) {
3133             if (strncmp (afp->contigs[i]->reads[j]->read_id, id, match_len) == 0) {
3134                 found = 1;
3135                 rval &= s_AddScoresToRead (afp->contigs[i]->reads[j], scores, fasta);              
3136             }
3137         }
3138     }
3139     if (!found) {
3140         printf ("Unable to locate %s in ACE file\n", id);
3141         rval = 0;
3142     }
3143     return rval;
3144 }
3145 
3146 
3147 extern int
3148 AddReadQualScores
3149 (TACEFilePtr          afp,
3150  FReadLineFunction    readfunc,
3151  void *               userdata,
3152  FReadLineFunction    fasta_readfunc,
3153  void *               fasta_userdata)
3154 {
3155     char *linestring;
3156     char *score_id = NULL;
3157     TQualScoreListPtr scores = NULL;
3158     TSeqListBufPtr    fasta = NULL;
3159     int               rval = 1;
3160 
3161     if (afp == NULL || readfunc == NULL) {
3162         return 0;
3163     }
3164 
3165     if (fasta_readfunc != NULL) {
3166         fasta = s_ReadFastaFile (fasta_readfunc, fasta_userdata);
3167         if (fasta == NULL) {
3168             printf ("Unable to read FASTA file\n");
3169             return 0;
3170         }
3171     }
3172 
3173     linestring = readfunc (userdata);
3174     while (!s_IsEOF (linestring)) {
3175         if (linestring[0] == '>') {            
3176             if (score_id != NULL && scores != NULL) {
3177                 /* add previously read scores to last read */
3178                 if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3179                     printf ("Failed to add quality scores from %s\n", score_id);
3180                     rval = 0;
3181                 }
3182             }
3183             s_QualScoreFree (scores);
3184             scores = NULL;
3185             free (score_id);
3186 
3187             score_id = s_GetFASTAIdFromString (linestring);
3188         } else if (score_id != NULL) {
3189             scores = s_AddQualScores (linestring, scores);
3190         }
3191         free (linestring);
3192         linestring = readfunc (userdata);
3193     }
3194 
3195     /* handle last set of scores read */
3196     if (score_id != NULL && scores != NULL) {
3197         /* add previously read scores to last read */
3198         if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3199             printf ("Failed to add quality scores from %s\n", score_id);
3200             rval = 0;
3201         }
3202     }
3203     s_SeqListBufFree (fasta);
3204     s_QualScoreFree (scores);
3205     scores = NULL;
3206     free (score_id);
3207     return rval;
3208 }
3209 
3210 
/* Maps a consensus character to its slot in a five-entry count table:
 * 'A' -> 0, 'T' -> 1, 'G' -> 2, 'C' -> 3, '*' -> 4.
 * Any other character (including lower-case letters) yields -1.
 */
static int s_LetterPos (char ch) {
    static const char kSlotLetters[] = { 'A', 'T', 'G', 'C', '*' };
    int slot;

    for (slot = 0; slot < (int) (sizeof (kSlotLetters) / sizeof (kSlotLetters[0])); slot++) {
        if (kSlotLetters[slot] == ch) {
            return slot;
        }
    }
    return -1;
}
3233 
3234 
/* Converts a position in a gapped sequence into the corresponding position
 * in the same sequence with the gap characters ('*') removed.
 *
 * gapped_pos - number of leading characters of seq to examine.
 * seq        - NUL-terminated sequence; may be NULL.
 *
 * Returns the number of non-gap characters among the first gapped_pos
 * characters of seq (or among all of seq when it is shorter).  Returns 0
 * when seq is NULL or gapped_pos is not positive.
 */
static int s_GetUngappedPosition (int gapped_pos, char *seq)
{
    int pos;
    int ungapped_pos = 0;

    if (seq == NULL) {
        return 0;
    }

    for (pos = 0; pos < gapped_pos && seq[pos] != 0; pos++) {
        if (seq[pos] != '*') {
            ungapped_pos++;
        }
    }
    return ungapped_pos;
}
3253 
3254 
3255 static int s_GetQualScoreForReadPos (TContigReadPtr r, int pos)
3256 {
3257     if (r == NULL || pos < 0) {
3258         return 0;
3259     }
3260 
3261     /* note - don't need to get ungapped position because 0s are inserted when qual scores
3262      * are added to the reads.
3263      */
3264     /*pos = s_GetUngappedPosition (pos, r->read_seq); */
3265     if (pos > r->num_qual_scores) {
3266         return 0;
3267     } else {
3268         return r->qual_scores[pos];
3269     }
3270 }
3271 
3272 
3273 extern int ReplaceConsensusSequenceFromTraces (TContigPtr contig, char only_ns)
3274 {
3275     char * consensus_buf;
3276     int  * new_qual_scores = NULL;
3277     int    num_qual_scores = 0;
3278     int    i, k, best, letter_pos;
3279     int    char_counts[5];
3280     char   best_ch, ch;
3281     int    num_best, sum_best;
3282     int  * consensus_qual_ptr = NULL;
3283     int    read_offset, len;
3284     int    num_change = 0;
3285 
3286     if (contig == NULL) {
3287         return 0;
3288     }
3289 
3290     consensus_buf = (char *) malloc (sizeof (char) * (contig->consensus_assem_len + 1));
3291     if (contig->reads[0]->num_qual_scores > 0) {
3292         new_qual_scores = (int *) malloc (sizeof (int) * contig->consensus_seq_len);
3293     }
3294 
3295     consensus_qual_ptr = contig->qual_scores;
3296 
3297     for (i = 0; i < contig->consensus_assem_len; i++) {
3298         if (only_ns && contig->consensus_seq[i] != 'N') {
3299             /* just use existing consensus character */
3300             consensus_buf[i] = contig->consensus_seq[i];
3301             /* add in qual scores */
3302             if (consensus_qual_ptr != NULL && new_qual_scores != NULL && contig->consensus_seq[i] != '*') {
3303                 new_qual_scores[num_qual_scores++] = *consensus_qual_ptr;
3304             }
3305         } else {
3306             for (k = 0; k < 5; k++) {
3307                 char_counts[k] = 0;
3308             }
3309             best = 0;
3310             best_ch = 'N';
3311             for (k = 0; k < contig->num_reads; k++) {
3312                 read_offset = i - contig->reads[k]->cons_start;
3313                 len = strlen (contig->reads[k]->read_seq);
3314                 if (len > read_offset
3315                     && read_offset >= 0) {
3316                     ch = toupper (contig->reads[k]->read_seq[read_offset]);
3317                     letter_pos = s_LetterPos (ch);
3318                     if (letter_pos > -1) {
3319                       char_counts[letter_pos]++;
3320                       if (char_counts[letter_pos] > best
3321                           || (char_counts[letter_pos] == best && best_ch == '*')) {
3322                         best_ch = ch;
3323                         best = char_counts[letter_pos];
3324                       }
3325                     }
3326                 }
3327             }
3328             if (toupper (consensus_buf[i]) != best_ch) {
3329                 num_change++;
3330                 consensus_buf[i] = best_ch;
3331                 if (best_ch != '*') {
3332                     /* calculate quality score */
3333                     if (new_qual_scores != NULL) {
3334                         sum_best = 0;
3335                         num_best = 0;
3336                         for (k = 0; k < contig->num_reads; k++) {
3337                             if (contig->reads[k]->num_qual_scores > i - contig->reads[k]->cons_start
3338                                 &&  best_ch == toupper (contig->reads[k]->read_seq[i - contig->reads[k]->cons_start])) {
3339                                 num_best ++;
3340                                 sum_best += s_GetQualScoreForReadPos (contig->reads[k], i - contig->reads[k]->cons_start);
3341                             }
3342                         }
3343                         if (num_best == 0) {
3344                             new_qual_scores[num_qual_scores++] = 0;
3345                         } else {
3346                             new_qual_scores[num_qual_scores++] = sum_best / num_best;
3347                         }
3348                     }
3349                 }
3350             }
3351         }
3352         if (consensus_qual_ptr != NULL && contig->consensus_seq[i] != '*') {
3353             consensus_qual_ptr++;
3354         }
3355     }
3356     consensus_buf[i] = 0;
3357 
3358     free (contig->consensus_seq);
3359     contig->consensus_seq = consensus_buf;
3360     if (contig->qual_scores != NULL) {
3361         free (contig->qual_scores);
3362     }
3363     contig->qual_scores = new_qual_scores;
3364     contig->num_qual_scores = num_qual_scores;
3365     
3366     return num_change;
3367 }
3368 
3369 
3370 extern void RecalculateConsensusSequences (TACEFilePtr ace_file, char only_ns)
3371 {
3372     int i;
3373 
3374     if (ace_file == NULL) {
3375         return;
3376     }
3377 
3378     for (i = 0; i < ace_file->num_contigs; i++) {
3379         ReplaceConsensusSequenceFromTraces(ace_file->contigs[i], only_ns);
3380     }
3381 
3382 }
3383 
3384 
3385 extern void WriteContigQualScores (TContigPtr contig, FILE *out)
3386 {
3387     int i = 0, j;
3388 
3389     if (contig == NULL || contig->qual_scores == NULL || contig->num_qual_scores < 1 || out == NULL) {
3390         return;
3391     }
3392     fprintf (out, ">%s\n", contig->consensus_id);
3393 
3394     while (i < contig->num_qual_scores) {
3395         for (j = 0; j < 60 && i < contig->num_qual_scores; j++, i++) {
3396             fprintf (out, "%d ", contig->qual_scores[i]);
3397         }
3398         fprintf (out, "\n");
3399     }
3400     fprintf (out, "\n");
3401 }
3402 
3403 
3404 extern char
3405 ProcessLargeACEFileForContigFastaAndQualScores
3406 (FReadLineFunction    readfunc,
3407  void *               userdata,
3408  char                 make_qual_scores,
3409  char *               has_errors,
3410  ProcessContigFunc    process_func,
3411  void *               process_data)
3412 {
3413     char *              linestring;
3414     char *              cp;
3415     int                 contig_num = 0, read_num = 0;
3416     int                 num_reads_expected = 0;
3417     int                 num_contigs = 0;
3418     TContigPtr          contig = NULL;
3419     char                rval = 1;
3420 
3421     if (readfunc == NULL || process_func == NULL) {
3422         return 0;
3423     }
3424 
3425     linestring = readfunc (userdata);
3426 
3427     while (linestring != NULL  &&  linestring [0] != EOF) {
3428         if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
3429             if (num_reads_expected > 0) {
3430                 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
3431                 return 0;
3432             }
3433             /* first line in file, number of contigs */
3434             cp = linestring + 3;
3435             num_contigs = atoi (cp);
3436             while (isdigit (*cp)) {
3437                 cp++;
3438             }
3439             num_reads_expected = atoi (cp);
3440             linestring = readfunc (userdata);
3441         } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
3442             if (contig_num >= num_contigs) {
3443                 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
3444                 return 0;
3445             }
3446 
3447             contig = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
3448             if (contig == NULL) {
3449                 PrintACEFormatErrorXMLStart (NULL, has_errors);
3450                 printf ("Unable to read contig (%d)", contig_num);
3451                 PrintACEFormatErrorXMLEnd ();
3452                 return 0;
3453             }
3454             read_num += contig->num_reads;
3455             process_func (contig, process_data);
3456             ContigFree (contig);
3457             contig = NULL;
3458             contig_num++;
3459         } else if (s_UnexpectedLineBetweenContigs (linestring)) {
3460             PrintACEFormatErrorXMLStart (NULL, has_errors);
3461             printf ("Unexpected line after contig %d", read_num);
3462             PrintACEFormatErrorXMLEnd ();
3463             return 0;
3464         } else {
3465             linestring = readfunc (userdata);
3466         }
3467     }
3468     if (contig_num < num_contigs) {
3469         PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
3470         rval = 0;
3471     } else if (read_num < num_reads_expected) {
3472         PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
3473         rval = 0;
3474     }
3475 
3476     return rval;
3477 }
3478 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.