|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/api/aceread.c |
source navigation diff markup identifier search freetext search file search |
1 /*
2 * $Id: aceread.c,v 1.16 2008/12/22 22:40:30 bollin Exp $
3 *
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Colleen Bollin
29 *
30 */
31
32
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <ctype.h>
37 #include <util/creaders/alnread.h>
38 #include <aceread.h>
39
40
/* Boolean type local to this file.  Note that "true" is -1, not 1. */
typedef enum {
  eFalse = 0,
  eTrue = -1
} EBool;
45
46
/* Result of looking for an expected group of items while parsing. */
typedef enum {
  eJustRight  = 0,  /* found what was expected; nothing is reported */
  eNone       = 1,  /* found no items */
  eTooMany    = 2,  /* found more items than expected */
  eTooFew     = 3,  /* found fewer items than expected */
  eUnexpected = 4   /* hit an unexpected character while reading */
} EFound;
54
55
/* Writes the opening of an XML error <message> element to stdout.
 *
 * id          sequence identifier placed in the seq-id attribute;
 *             "No ID" is used when id is NULL.
 * has_errors  optional flag.  When it points at 0, the <aceread> tag is
 *             written first and the flag is set to 1.
 *
 * The element is left open; the caller prints the message text and then
 * calls PrintACEFormatErrorXMLEnd.
 */
extern void PrintACEFormatErrorXMLStart (char *id, char *has_errors)
{
  const char *shown_id = (id != NULL) ? id : "No ID";

  if (has_errors != NULL && *has_errors == 0) {
    printf ("<aceread>\n");
    *has_errors = 1;
  }
  printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">", shown_id);
}
66
67
/* Writes the closing </message> tag (followed by a newline) to stdout. */
extern void PrintACEFormatErrorXMLEnd (void)
{
  fputs ("</message>\n", stdout);
}
72
73
/* Writes one complete XML error <message> element to stdout.
 *
 * msg         text placed inside the element.
 * id          sequence identifier for the seq-id attribute; "No ID" when NULL.
 * has_errors  optional flag.  When it points at 0, the <aceread> tag is
 *             written first and the flag is set to 1.
 */
extern void PrintACEFormatErrorXML (char *msg, char *id, char *has_errors)
{
  const char *shown_id = (id != NULL) ? id : "No ID";

  if (has_errors != NULL && *has_errors == 0) {
    printf ("<aceread>\n");
    *has_errors = 1;
  }
  printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">%s</message>\n", shown_id, msg);
}
84
85
86 static void s_ReportFound (EFound val, char *label, char *id, char *has_errors)
87 {
88 switch (val) {
89 case eNone:
90 PrintACEFormatErrorXMLStart (id, has_errors);
91 printf ("Found no %s", label);
92 PrintACEFormatErrorXMLEnd ();
93 break;
94 case eTooMany:
95 PrintACEFormatErrorXMLStart (id, has_errors);
96 printf ("Too many %s", label);
97 PrintACEFormatErrorXMLEnd ();
98 break;
99 case eTooFew:
100 PrintACEFormatErrorXMLStart (id, has_errors);
101 printf ("Too few %s", label);
102 PrintACEFormatErrorXMLEnd ();
103 break;
104 case eUnexpected:
105 PrintACEFormatErrorXMLStart (id, has_errors);
106 printf ("Unexpected character while reading %s", label);
107 PrintACEFormatErrorXMLEnd ();
108 break;
109 case eJustRight:
110 break;
111 default:
112 PrintACEFormatErrorXML ("Unknown error", id, has_errors);
113 break;
114 }
115 }
116
117
118 extern TGapInfoPtr GapInfoNew (void)
119 {
120 TGapInfoPtr g;
121
122 g = (TGapInfoPtr) malloc (sizeof (SGapInfo));
123 if (g != NULL) {
124 g->num_gaps = 0;
125 g->gap_offsets = NULL;
126 }
127 return g;
128 }
129
130
131 extern void GapInfoFree (TGapInfoPtr g)
132 {
133 if (g != NULL) {
134 free (g->gap_offsets);
135 free (g);
136 }
137 }
138
139
/* Returns 1 when ch is one of the characters in gap_chars, 0 otherwise.
 * The NUL character never counts as a gap, and a NULL gap_chars list
 * matches nothing.
 */
static int s_IsGapChar (char ch, char *gap_chars)
{
  if (gap_chars == NULL || ch == 0) {
    return 0;
  }
  /* ch is non-zero here, so strchr cannot match the list terminator */
  return (strchr (gap_chars, ch) != NULL) ? 1 : 0;
}
154
155
156 /* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap */
157 extern TGapInfoPtr GapInfoFromSequenceString (char *seq_str, char *gap_chars)
158 {
159 char * cp;
160 int num_gaps = 0, pos, gap_num = 0;
161 TGapInfoPtr g = NULL;
162
163 if (seq_str == NULL) return NULL;
164
165 /* first determine number of gaps */
166 cp = seq_str;
167 while (*cp != 0) {
168 if (s_IsGapChar(*cp, gap_chars)) {
169 num_gaps++;
170 }
171 cp++;
172 }
173
174 g = GapInfoNew ();
175 if (num_gaps > 0) {
176 g->num_gaps = num_gaps;
177 g->gap_offsets = malloc (g->num_gaps * sizeof (int));
178 cp = seq_str;
179 pos = 0;
180 while (*cp != 0) {
181 if (s_IsGapChar(*cp, gap_chars)) {
182 g->gap_offsets[gap_num] = pos;
183 gap_num++;
184 pos = 0;
185 } else {
186 pos++;
187 }
188 cp++;
189 }
190 }
191 return g;
192 }
193
/* Removes every gap character from seq_str in place.
 *
 * seq_str    NUL-terminated, writable string; NULL is ignored.
 * gap_chars  characters to remove; NULL is ignored.
 *
 * The remaining characters are shifted left and the string is
 * re-terminated, so the result is shorter by the number of gaps removed.
 */
extern void RemoveGapCharsFromSequenceString (char *seq_str, char *gap_chars)
{
  char *cp_src, *cp_dst;

  if (seq_str == NULL || gap_chars == NULL) {
    return;
  }

  cp_dst = seq_str;
  for (cp_src = seq_str; *cp_src != 0; cp_src++) {
    /* *cp_src is non-zero, so strchr cannot match the list terminator */
    if (strchr (gap_chars, *cp_src) == NULL) {
      *cp_dst = *cp_src;
      cp_dst++;
    }
  }
  /* terminate the compacted string; without this the old tail survives */
  *cp_dst = 0;
}
212
213
214 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
215 extern int SeqPosFromTilingPos (int tiling_pos, TGapInfoPtr gap_info)
216 {
217 int pos = 0, seq_pos = 0, gap_num = 0;
218
219 if (tiling_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
220 return tiling_pos;
221 }
222
223 while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= tiling_pos) {
224 seq_pos += gap_info->gap_offsets[gap_num];
225 pos += gap_info->gap_offsets[gap_num] + 1;
226 gap_num++;
227 }
228 seq_pos += tiling_pos - pos;
229 return seq_pos;
230 }
231
232
233 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
234 extern int TilingPosFromSeqPos (int seq_pos, TGapInfoPtr gap_info)
235 {
236 int pos = 0, tiling_pos = 0, gap_num = 0;
237
238 if (seq_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
239 return seq_pos;
240 }
241
242 while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= seq_pos) {
243 pos += gap_info->gap_offsets[gap_num];
244 tiling_pos += gap_info->gap_offsets[gap_num] + 1;
245 gap_num++;
246 }
247 tiling_pos += seq_pos - pos;
248 return tiling_pos;
249 }
250
251
252 /* adjust gap info when sequence is trimmed */
253 static void AdjustGapInfoFor5Trim (TGapInfoPtr gap_info, int trim)
254 {
255 int pos = 0;
256 int num_gaps = 0;
257 int i;
258
259 if (gap_info == NULL || gap_info->num_gaps < 1 || trim < 1) {
260 return;
261 }
262
263 while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < trim) {
264 pos += gap_info->gap_offsets[num_gaps];
265 num_gaps++;
266 }
267 if (num_gaps < gap_info->num_gaps) {
268 gap_info->gap_offsets[num_gaps] -= trim - pos;
269 for (i = num_gaps; i < gap_info->num_gaps; i++) {
270 gap_info->gap_offsets[i - num_gaps] = gap_info->gap_offsets[i];
271 }
272 gap_info->num_gaps -= num_gaps;
273 } else {
274 free (gap_info->gap_offsets);
275 gap_info->gap_offsets = NULL;
276 gap_info->num_gaps = 0;
277 }
278
279 }
280
281
282 static void AdjustGapInfoFor3Trim (TGapInfoPtr gap_info, int new_len)
283 {
284 int pos = 0;
285 int num_gaps = 0;
286
287 if (gap_info == NULL || gap_info->num_gaps < 1) {
288 return;
289 }
290
291 while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < new_len) {
292 pos += gap_info->gap_offsets[num_gaps];
293 num_gaps++;
294 }
295 if (num_gaps < gap_info->num_gaps) {
296 gap_info->num_gaps = num_gaps;
297 }
298 }
299
300 /* TODO: NEED TO write function for truncating on right, test function for truncating on left */
301
302 extern TContigReadPtr ContigReadNew (void)
303 {
304 TContigReadPtr r;
305
306 r = (TContigReadPtr) malloc (sizeof (SContigRead));
307 if (r == NULL) {
308 return NULL;
309 }
310 r->read_id = NULL;
311 r->ti = 0;
312 r->srr = NULL;
313 r->read_seq = NULL;
314 r->is_complement = 0;
315 r->cons_start = 0;
316 r->cons_stop = 0;
317 r->gaps = NULL;
318 r->local = 1;
319 r->valid = 0;
320 r->qual_scores = NULL;
321 r->num_qual_scores = 0;
322 r->tag = NULL;
323 return r;
324 }
325
326
327 extern void ContigReadFree (TContigReadPtr r)
328 {
329 if (r != NULL) {
330 if (r->read_id != NULL) {
331 free (r->read_id);
332 }
333 if (r->srr != NULL) {
334 free (r->srr);
335 }
336 if (r->read_seq != NULL) {
337 free (r->read_seq);
338 }
339 if (r->gaps != NULL) {
340 GapInfoFree (r->gaps);
341 }
342 if (r->qual_scores != NULL) {
343 free (r->qual_scores);
344 }
345 if (r->tag != NULL) {
346 free (r->tag);
347 }
348 free (r);
349 }
350 }
351
352
353 extern TBaseSegPtr BaseSegNew (void)
354 {
355 TBaseSegPtr b;
356
357 b = (TBaseSegPtr) malloc (sizeof (SBaseSeg));
358 if (b == NULL) {
359 return NULL;
360 }
361 b->read_id = NULL;
362 b->cons_start = 0;
363 b->cons_stop = 0;
364 return b;
365 }
366
367
368 extern void BaseSegFree (TBaseSegPtr b)
369 {
370 if (b != NULL) {
371 if (b->read_id != NULL) {
372 free (b->read_id);
373 }
374 free (b);
375 }
376 }
377
378
379 /* reads a correctly formatted line and creates a base seg.
380 */
381 static TBaseSegPtr s_ReadBaseSeg (char *line)
382 {
383 TBaseSegPtr base_seg = NULL;
384 char *cp;
385 int start, stop, len;
386
387 if (line == NULL || *line != 'B' || *(line + 1) != 'S') {
388 return NULL;
389 }
390
391
392 cp = line + 2;
393 while (isspace (*cp)) {
394 cp++;
395 }
396 if (!isdigit (*cp)) {
397 return NULL;
398 }
399 start = atoi (cp);
400 while (isdigit (*cp)) {
401 cp++;
402 }
403 while (isspace (*cp)) {
404 cp++;
405 }
406 if (!isdigit (*cp)) {
407 return NULL;
408 }
409 stop = atoi (cp);
410 while (isdigit (*cp)) {
411 cp++;
412 }
413 while (isspace (*cp)) {
414 cp++;
415 }
416 if (*cp == 0) {
417 return NULL;
418 }
419
420 len = strlen (cp);
421
422 base_seg = BaseSegNew ();
423 base_seg->cons_start = start;
424 base_seg->cons_stop = stop;
425 base_seg->read_id = malloc (sizeof (char) * len + 1);
426 strcpy (base_seg->read_id, cp);
427
428 return base_seg;
429 }
430
431
432 extern TConsensusReadAlnPtr ConsensusReadAlnNew (int numseg)
433 {
434 TConsensusReadAlnPtr a;
435 int i;
436
437 a = (TConsensusReadAlnPtr) malloc (sizeof (SConsensusReadAln));
438 a->is_complement = 0;
439 if (numseg < 1) {
440 a->lens = NULL;
441 a->cons_starts = NULL;
442 a->read_starts = NULL;
443 a->numseg = 0;
444 } else {
445 a->lens = (int *) malloc (sizeof (int) * numseg);
446 a->cons_starts = (int *) malloc (sizeof (int) * numseg);
447 a->read_starts = (int *) malloc (sizeof (int) * numseg);
448 for (i = 0; i < numseg; i++) {
449 a->lens[i] = 0;
450 a->cons_starts[i] = 0;
451 a->read_starts[0] = 0;
452 }
453 a->numseg = numseg;
454 }
455 return a;
456 }
457
458
459 extern TConsensusReadAlnPtr ConsensusReadAlnFree (TConsensusReadAlnPtr a)
460 {
461 if (a != NULL) {
462 if (a->lens != NULL) {
463 free (a->lens);
464 a->lens = NULL;
465 }
466 if (a->cons_starts != NULL) {
467 free (a->cons_starts);
468 a->cons_starts = NULL;
469 }
470 if (a->read_starts != NULL) {
471 free (a->read_starts);
472 a->read_starts = NULL;
473 }
474 free (a);
475 a = NULL;
476 }
477 return a;
478 }
479
480
481 extern TConsensusReadAlnPtr GetConsensusReadAln (char *consensus_seq, TContigReadPtr read)
482 {
483 TConsensusReadAlnPtr aln = NULL;
484 char *c;
485 char *c_start;
486 char *r;
487 char *r_start;
488 int numseg = 0, aln_len, pos, seg, con_offset = 0, read_offset = 0;
489 char con_gap_open = 0, read_gap_open = 0, gap_change;
490
491 if (consensus_seq == NULL || read == NULL) {
492 return NULL;
493 }
494
495 if (read->cons_start > 0) {
496 c_start = consensus_seq + read->cons_start;
497 r_start = read->read_seq + read->read_assem_start - 1;
498 } else {
499 c_start = consensus_seq;
500 r_start = read->read_seq + read->read_assem_start - 1;
501 }
502
503 aln_len = read->cons_stop - read->cons_start + 1;
504 while (*c_start == '*' && *r_start == '*') {
505 c_start++;
506 r_start++;
507 aln_len--;
508 }
509
510 /* first, count number of segments needed */
511 c = c_start;
512 r = r_start;
513 if (*c != '*' && *r != '*') {
514 numseg++;
515 }
516 pos = 0;
517 while (*c != 0 && *r != 0 && pos < aln_len) {
518 if (*c == '*' && *r == '*') {
519 /* both in gap - ignore */
520 } else {
521 gap_change = 0;
522 if (*c == '*') {
523 if (!con_gap_open) {
524 gap_change = 1;
525 con_gap_open = 1;
526 }
527 } else {
528 if (con_gap_open) {
529 gap_change = 1;
530 con_gap_open = 0;
531 }
532 }
533 if (*r == '*') {
534 if (!read_gap_open) {
535 gap_change = 1;
536 read_gap_open = 1;
537 }
538 } else {
539 if (read_gap_open) {
540 gap_change = 1;
541 read_gap_open = 0;
542 }
543 }
544 if (gap_change) {
545 numseg++;
546 }
547 }
548 c++;
549 r++;
550 pos++;
551 }
552
553 /* create alignment */
554 aln = ConsensusReadAlnNew (numseg);
555 pos = 0;
556 seg = 0;
557
558
559 c = consensus_seq;
560 while (c < c_start) {
561 if (*c != '*') {
562 con_offset ++;
563 }
564 c++;
565 }
566
567 r = read->read_seq;
568 while (r < r_start) {
569 if (*r != '*') {
570 read_offset ++;
571 }
572 r++;
573 }
574
575
576 if (*c_start == '*') {
577 aln->cons_starts[0] = -1;
578 con_gap_open = 1;
579 } else {
580 aln->cons_starts[0] = con_offset;
581 con_gap_open = 0;
582 }
583
584 if (*r_start == '*') {
585 aln->read_starts[0] = -1;
586 read_gap_open = 1;
587 } else {
588 aln->read_starts[0] = read_offset;
589 read_gap_open = 0;
590 }
591
592 c = c_start + 1;
593 r = r_start + 1;
594 aln->lens[0] = 1;
595 pos = 1;
596
597 while (*c != 0 && *r != 0 && pos < aln_len) {
598 if (*c == '*' && *r == '*') {
599 /* both in gap - ignore */
600 } else {
601 gap_change = 0;
602 if (*c == '*') {
603 if (!con_gap_open) {
604 gap_change = 1;
605 con_gap_open = 1;
606 }
607 } else {
608 if (con_gap_open) {
609 gap_change = 1;
610 con_gap_open = 0;
611 }
612 }
613 if (*r == '*') {
614 if (!read_gap_open) {
615 gap_change = 1;
616 read_gap_open = 1;
617 }
618 } else {
619 if (read_gap_open) {
620 gap_change = 1;
621 read_gap_open = 0;
622 }
623 }
624 if (gap_change) {
625 seg++;
626 if (con_gap_open) {
627 aln->cons_starts[seg] = -1;
628 } else if (aln->cons_starts[seg - 1] > -1) {
629 aln->cons_starts[seg] = aln->cons_starts[seg - 1] + aln->lens[seg - 1];
630 } else if (seg > 1 && aln->cons_starts[seg - 2] > -1) {
631 aln->cons_starts[seg] = aln->cons_starts[seg - 2] + aln->lens[seg - 2];
632 } else {
633 aln->cons_starts[seg] = con_offset;
634 }
635 if (read_gap_open) {
636 aln->read_starts[seg] = -1;
637 } else if (aln->read_starts[seg - 1] > -1) {
638 aln->read_starts[seg] = aln->read_starts[seg - 1] + aln->lens[seg - 1];
639 } else if (seg > 1 && aln->read_starts[seg - 2] > -1) {
640 aln->read_starts[seg] = aln->read_starts[seg - 2] + aln->lens[seg - 2];
641 } else {
642 aln->read_starts[seg] = read_offset;
643 }
644 }
645 aln->lens[seg]++;
646 }
647 c++;
648 r++;
649 pos++;
650 }
651
652 /* todo - adjust starts for complement */
653 if (read->is_complement) {
654 for (seg = 0; seg < aln->numseg; seg++) {
655 if (aln->read_starts[seg] > -1) {
656 aln->read_starts[seg] = read->read_len - aln->read_starts[seg] - aln->lens[seg];
657 }
658 }
659 aln->is_complement = 1;
660 }
661
662 return aln;
663 }
664
665
666 extern TContigPtr ContigNew (void)
667 {
668 TContigPtr c;
669
670 c = (TContigPtr) malloc (sizeof (SContig));
671 if (c == NULL) {
672 return NULL;
673 }
674 c->consensus_id = NULL;
675 c->consensus_seq = NULL;
676 c->consensus_assem_len = 0;
677 c->consensus_seq_len = 0;
678 c->is_complement = 0;
679 c->num_qual_scores = 0;
680 c->qual_scores = NULL;
681 c->num_reads = 0;
682 c->reads = NULL;
683 c->gaps = NULL;
684 c->num_reads = 0;
685 c->reads = NULL;
686 c->num_base_segs = 0;
687 c->base_segs = NULL;
688 c->tag = NULL;
689
690 return c;
691 }
692
693
694 extern void ContigFree (TContigPtr c)
695 {
696 int i;
697
698 if (c != NULL) {
699 if (c->consensus_id != NULL) free (c->consensus_id);
700 if (c->consensus_seq != NULL) free (c->consensus_seq);
701 if (c->qual_scores != NULL) free (c->qual_scores);
702
703 if (c->reads != NULL) {
704 for (i = 0; i < c->num_reads; i++) {
705 if (c->reads[i] != NULL) {
706 ContigReadFree (c->reads[i]);
707 }
708 }
709 free (c->reads);
710 }
711 if (c->base_segs != NULL) {
712 for (i = 0; i < c->num_base_segs; i++) {
713 if (c->base_segs[i] != NULL) {
714 BaseSegFree (c->base_segs[i]);
715 }
716 }
717 free (c->base_segs);
718 }
719 if (c->tag != NULL) {
720 free (c->tag);
721 }
722 free (c);
723 }
724 }
725
726
727 extern TACEFilePtr ACEFileNew ()
728 {
729 TACEFilePtr afp;
730
731 afp = (TACEFilePtr) malloc (sizeof (SACEFile));
732 if (afp == NULL) {
733 return NULL;
734 }
735 afp->num_contigs = 0;
736 afp->contigs = NULL;
737
738 return afp;
739 }
740
741
742 extern void ACEFileFree (TACEFilePtr afp)
743 {
744 int i;
745
746 if (afp != NULL) {
747 for (i = 0; i < afp->num_contigs; i++) {
748 ContigFree (afp->contigs[i]);
749 }
750 free (afp->contigs);
751 free (afp);
752 }
753 }
754
755
/* Returns 1 when ch can appear in sequence data: the gap character '*'
 * or an alphabetic character.  Returns 0 otherwise.
 */
static char s_IsSeqChar (char ch)
{
  /* isalpha is only defined for values representable as unsigned char
   * (or EOF); a negative plain char must be converted first */
  if (ch == '*' || isalpha ((unsigned char) ch)) {
    return 1;
  }
  return 0;
}
764
765
/* Returns 1 when a line marks end of input -- a NULL pointer or a line
 * whose first character compares equal to EOF -- and 0 otherwise.
 */
static char s_IsEOF (char *linestring)
{
  return (linestring == NULL || linestring [0] == EOF) ? 1 : 0;
}
774
775
776 static char *
777 s_ReadSequenceFromFile
778 (int len,
779 FReadLineFunction readfunc,
780 void * userdata,
781 char * id,
782 char * has_errors)
783 {
784 char *seq;
785 char *linestring;
786 char *cp;
787 int pos = 0;
788
789 /* copy in sequence data */
790 seq = malloc (len + 1);
791 linestring = readfunc (userdata);
792 while (!s_IsEOF (linestring) && s_IsSeqChar (linestring [0])) {
793 /* append to consensus */
794 cp = linestring;
795 while (s_IsSeqChar (*cp) && pos < len) {
796 if (isalpha (*cp)) {
797 seq [pos] = toupper (*cp);
798 } else {
799 seq [pos] = *cp;
800 }
801 pos++;
802 cp++;
803 }
804 if (s_IsSeqChar (*cp)) {
805 PrintACEFormatErrorXML ("Too many sequence characters!", id, has_errors);
806 free (seq);
807 return NULL;
808 }
809 free (linestring);
810 linestring = readfunc (userdata);
811 }
812 free (linestring);
813 if (pos < len) {
814 PrintACEFormatErrorXML ("Too few sequence characters!", id, has_errors);
815 free (seq);
816 seq = NULL;
817 } else {
818 seq[pos] = 0;
819 }
820 return seq;
821 }
822
823
/* Returns 1 when the line is not an EOF marker and contains nothing but
 * whitespace (an empty string included); returns 0 otherwise.
 */
static char s_LineIsEmptyButNotEof (char *linestring)
{
  char *scan;

  if (s_IsEOF (linestring)) {
    return 0;
  }

  /* skip leading whitespace; a blank line ends up at the terminator */
  for (scan = linestring; *scan != 0 && isspace (*scan); scan++) {
    /* nothing */
  }
  return (*scan == 0) ? 1 : 0;
}
841
842
843 static void s_SkipQualScores
844 (FReadLineFunction readfunc,
845 void * userdata)
846 {
847 char * linestring;
848 char * cp;
849 if (readfunc == NULL) return;
850
851 linestring = readfunc (userdata);
852 while (s_LineIsEmptyButNotEof (linestring)) {
853 free (linestring);
854 linestring = readfunc (userdata);
855 }
856 if (linestring == NULL || linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
857 return;
858 }
859 linestring = readfunc (userdata);
860 while (!s_IsEOF (linestring)
861 && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
862 free (linestring);
863 linestring = readfunc (userdata);
864 }
865 free (linestring);
866 }
867
868
869 static EFound s_ReadQualScores
870 (TContigPtr contig,
871 FReadLineFunction readfunc,
872 void * userdata)
873 {
874 char * linestring;
875 char * cp;
876 int pos;
877
878 if (contig == NULL || readfunc == NULL || contig->consensus_assem_len == 0) {
879 return eNone;
880 }
881
882 linestring = readfunc (userdata);
883 while (s_LineIsEmptyButNotEof (linestring)) {
884 free (linestring);
885 linestring = readfunc (userdata);
886 }
887 if (linestring == NULL || linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
888 return eNone;
889 }
890
891 /* read quality scores */
892 contig->num_qual_scores = contig->consensus_assem_len;
893 /* no score for * in consensus seq */
894 for (pos = 0; pos < contig->consensus_assem_len; pos++) {
895 if (contig->consensus_seq[pos] == '*') {
896 contig->num_qual_scores --;
897 }
898 }
899 contig->qual_scores = malloc (sizeof (int) * contig->num_qual_scores);
900 pos = 0;
901 linestring = readfunc (userdata);
902 while (!s_IsEOF (linestring)
903 && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
904 while (isdigit (*cp) && pos < contig->num_qual_scores) {
905 contig->qual_scores [pos] = atoi (cp);
906 pos++;
907 while (isdigit (*cp)) {
908 cp++;
909 }
910 while (isspace (*cp)) {
911 cp++;
912 }
913 }
914 if (isdigit (*cp)) {
915 return eTooMany;
916 }
917 free (linestring);
918 linestring = readfunc (userdata);
919 }
920 if (pos < contig->num_qual_scores) {
921 return eTooFew;
922 } else {
923 return eJustRight;
924 }
925 }
926
927
928 static EFound s_ReadAFLines
929 (TContigPtr contig,
930 FReadLineFunction readfunc,
931 void * userdata,
932 char ** next_line)
933 {
934 char * linestring;
935 char * cp;
936 int read_num, len;
937 EFound rval = eJustRight;
938
939 if (contig == NULL || readfunc == NULL || contig->num_reads == 0) return eNone;
940
941 /* get AF lines */
942 contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
943 linestring = readfunc (userdata);
944 while (s_LineIsEmptyButNotEof (linestring)) {
945 free (linestring);
946 linestring = readfunc (userdata);
947 }
948 if (linestring == NULL || linestring [0] == EOF || strncmp (linestring, "AF", 2) != 0) {
949 *next_line = linestring;
950 return eNone;
951 }
952
953 read_num = 0;
954 while (!s_IsEOF(linestring) && read_num < contig->num_reads
955 && linestring [0] == 'A' && linestring [1] == 'F' && isspace (linestring [2])) {
956 contig->reads[read_num] = ContigReadNew ();
957 len = strlen (linestring + 3);
958 contig->reads[read_num]->read_id = malloc (len + 1);
959 strcpy (contig->reads[read_num]->read_id, linestring + 3);
960 cp = contig->reads[read_num]->read_id;
961 while (*cp != 0 && !isspace (*cp)) {
962 cp++;
963 }
964 if (isspace (*cp)) {
965 *cp = 0;
966 cp++;
967 }
968 if (*cp == 'C') {
969 contig->reads[read_num]->is_complement = 1;
970 } else if (*cp != 'U') {
971 *next_line = linestring;
972 return eUnexpected;
973 }
974 cp++;
975 if (isspace (*cp)) {
976 cp++;
977 }
978 contig->reads[read_num]->cons_start = atoi (cp) - 1;
979 read_num++;
980 free (linestring);
981 linestring = readfunc (userdata);
982 }
983 if (read_num < contig->num_reads) {
984 rval = eTooFew;
985 } else if (!s_IsEOF(linestring) && strncmp (linestring, "AF ", 3) == 0) {
986 rval = eTooMany;
987 } else {
988 rval = eJustRight;
989 }
990 *next_line = linestring;
991 return rval;
992 }
993
994
995 static EFound s_ReadBaseSegs
996 (TContigPtr contig,
997 int num_base_segs,
998 char * firstline,
999 FReadLineFunction readfunc,
1000 void * userdata)
1001 {
1002 char * linestring;
1003
1004 if (contig == NULL || readfunc == NULL || num_base_segs == 0) return eNone;
1005
1006 contig->base_segs = malloc (sizeof (TBaseSegPtr) * num_base_segs);
1007 contig->num_base_segs = 0;
1008
1009 /* get BS lines */
1010 linestring = firstline;
1011 while (s_LineIsEmptyButNotEof (linestring)) {
1012 free (linestring);
1013 linestring = readfunc (userdata);
1014 }
1015 if (linestring == NULL || linestring [0] == EOF || strncmp (linestring, "BS", 2) != 0) {
1016 return eNone;
1017 }
1018
1019 while (linestring != NULL && linestring [0] != EOF && contig->num_base_segs < num_base_segs
1020 && linestring [0] == 'B' && linestring [1] == 'S' && isspace (linestring [2])) {
1021 contig->base_segs[contig->num_base_segs++] = s_ReadBaseSeg (linestring);
1022 free (linestring);
1023 linestring = readfunc (userdata);
1024 }
1025 if (contig->num_base_segs < num_base_segs) {
1026 return eTooFew;
1027 } else if (linestring != NULL && linestring [0] != EOF && ! s_LineIsEmptyButNotEof (linestring)) {
1028 return eTooMany;
1029 } else {
1030 return eJustRight;
1031 }
1032 }
1033
1034
/* Returns 1 when ch is 'N' or 'X' (upper case only), 0 otherwise. */
static char s_IsEquivN (char ch)
{
  return (ch == 'N' || ch == 'X') ? 1 : 0;
}
1043
1044
1045 /* Terminal Ns will always be trimmed in the GenBank records */
1046 static void s_AdjustContigReadForTerminalNs (TContigReadPtr read)
1047 {
1048 char * cp_src;
1049 char * cp_dst;
1050 int len = 0;
1051
1052 if (read == NULL || read->read_seq == NULL) return;
1053 cp_src = read->read_seq;
1054 while (s_IsEquivN(*cp_src)) {
1055 len++;
1056 cp_src++;
1057 }
1058 if (len > 0) {
1059 read->cons_start += len;
1060 cp_dst = read->read_seq;
1061 while (*cp_src != 0) {
1062 *cp_dst = *cp_src;
1063 cp_dst++;
1064 cp_src++;
1065 }
1066 *cp_dst = 0;
1067 }
1068 len = strlen (read->read_seq);
1069 cp_src = read->read_seq + len - 1;
1070 while (cp_src >= read->read_seq && s_IsEquivN(*cp_src)) {
1071 *cp_src = 0;
1072 cp_src--;
1073 }
1074 }
1075
1076
1077 /* Terminal Ns will always be trimmed by the GenBank record */
1078 static void s_AdjustContigForTerminalNs (TContigPtr contig)
1079 {
1080 char * cp_src;
1081 char * cp_dst;
1082 int len = 0, i;
1083
1084 if (contig == NULL || contig->consensus_seq == NULL) return;
1085 cp_src = contig->consensus_seq;
1086 while (s_IsEquivN(*cp_src)) {
1087 len++;
1088 cp_src++;
1089 }
1090 if (len > 0) {
1091 /* adjust quality scores */
1092 if (contig->qual_scores != NULL) {
1093 contig->num_qual_scores -= len;
1094 for (i = 0; i < contig->num_qual_scores; i++) {
1095 contig->qual_scores[i] = contig->qual_scores [i + len];
1096 }
1097 }
1098 /* adjust reads */
1099 if (contig->reads != NULL) {
1100 for (i = 0; i < contig->num_reads; i++) {
1101 if (contig->reads[i] != NULL) {
1102 contig->reads[i]->cons_start -= len;
1103 }
1104 }
1105 }
1106 /* adjust consensus sequence */
1107 cp_dst = contig->consensus_seq;
1108 while (*cp_src != 0) {
1109 *cp_dst = *cp_src;
1110 cp_dst++;
1111 cp_src++;
1112 }
1113 *cp_dst = 0;
1114 contig->consensus_assem_len -= len;
1115 }
1116 /* trim 3' Ns */
1117 len = 0;
1118 cp_src = contig->consensus_seq + contig->consensus_assem_len - 1;
1119 while (cp_src >= contig->consensus_seq && s_IsEquivN(*cp_src)) {
1120 *cp_src = 0;
1121 cp_src--;
1122 contig->consensus_assem_len--;
1123 len++;
1124 }
1125 /* truncate quality scores if 3' Ns trimmed */
1126 if (len > 0 && contig->qual_scores != NULL) {
1127 contig->num_qual_scores -= len;
1128 }
1129 }
1130
1131
1132 /* Clips the sequence read in according to the QA clipping.
1133 * The real coordinates will be recovered when an alignment is generated between
1134 * the sequence in the structure and the sequence downloaded from the Trace Archive.
1135 */
1136 static char ApplyQALineToRead (TContigReadPtr read, char *linestring, char *id, char *has_errors)
1137 {
1138 char *cp;
1139 int values[4];
1140 int i = 0;
1141
1142 if (read == NULL || linestring == NULL) {
1143 PrintACEFormatErrorXML ("File end where QA line should be", id, has_errors);
1144 return 0;
1145 }
1146
1147 cp = linestring;
1148 if (*cp != 'Q') {
1149 PrintACEFormatErrorXMLStart (id, has_errors);
1150 printf ("Expected QA line, found %s", linestring);
1151 PrintACEFormatErrorXMLEnd ();
1152 return 0;
1153 }
1154 cp++;
1155 if (*cp != 'A') {
1156 PrintACEFormatErrorXMLStart (id, has_errors);
1157 printf ("Expected QA line, found %s", linestring);
1158 PrintACEFormatErrorXMLEnd ();
1159 return 0;
1160 }
1161 cp++;
1162 while (*cp != 0 && i < 4) {
1163 while (isspace (*cp)) {
1164 cp++;
1165 }
1166 if (*cp != '-' && !isdigit (*cp)) {
1167 PrintACEFormatErrorXML ("Found non-number on QA line", id, has_errors);
1168 return 0;
1169 }
1170 values[i] = atoi (cp);
1171 i++;
1172 while (*cp == '-' || isdigit (*cp)) {
1173 cp++;
1174 }
1175 }
1176 if (*cp != 0 || i < 4) {
1177 PrintACEFormatErrorXML ("Fewer than four numbers on line", id, has_errors);
1178 return 0;
1179 }
1180 if (values[0] > 0 || values[2] > 0) {
1181 if (values[0] > values[2]) {
1182 read->read_assem_start = values[0];
1183 } else {
1184 read->read_assem_start = values[2];
1185 }
1186 }
1187
1188 if (values[1] > 0 && values[3] > 0) {
1189 if (values[1] < values[3]) {
1190 read->read_assem_stop = values[1];
1191 } else {
1192 read->read_assem_stop = values[3];
1193 }
1194 } else if (values[1] > 0) {
1195 read->read_assem_stop = values[1];
1196 } else if (values[3] > 0) {
1197 read->read_assem_stop = values[3];
1198 }
1199
1200 /* adjust first gap position for start */
1201 if (read->read_assem_start > 1 && read->gaps != NULL && read->gaps->num_gaps > 0 && read->gaps->gap_offsets != NULL) {
1202 read->gaps->gap_offsets[0] -= read->read_assem_start - 1;
1203 }
1204
1205 return 1;
1206 }
1207
1208
1209 /* calculate gap info for consensus sequence */
1210 /* calculate cons_stop positions and tiling positions for each read */
1211 static void s_CalculateContigOffsets (TContigPtr contig)
1212 {
1213 int i;
1214
1215 if (contig == NULL) return;
1216
1217 for (i = 0; i < contig->num_reads; i++) {
1218 contig->reads[i]->tiling_start = contig->reads[i]->read_assem_start + contig->reads[i]->cons_start;
1219 contig->reads[i]->tiling_stop = contig->reads[i]->read_assem_stop + contig->reads[i]->cons_start;
1220 contig->reads[i]->cons_stop = SeqPosFromTilingPos (contig->reads[i]->tiling_stop - 1, contig->gaps) + 1;
1221 contig->reads[i]->read_start = SeqPosFromTilingPos(contig->reads[i]->read_assem_start - 1, contig->reads[i]->gaps) + 1;
1222 contig->reads[i]->read_stop = SeqPosFromTilingPos(contig->reads[i]->read_assem_stop - 1, contig->reads[i]->gaps) + 1;
1223 }
1224
1225 }
1226
1227
/* Counts the characters of str that are not gap characters.
 * Returns 0 for a NULL string.
 */
static int s_GetUngappedSeqLen (char *str, char *gap_chars)
{
  char *scan;
  int count = 0;

  if (str == NULL) return 0;

  for (scan = str; *scan != 0; scan++) {
    if (s_IsGapChar (*scan, gap_chars)) {
      continue;
    }
    count++;
  }
  return count;
}
1241
1242
/* Joins two heap-allocated comment strings as "orig\nextra".
 *
 * Ownership of both arguments passes to this function; the caller owns
 * the returned string.
 *   - orig == NULL:  extra is returned as is (may itself be NULL).
 *   - extra == NULL: orig is returned as is.
 *   - otherwise a new string is allocated and both inputs are freed.
 * When the allocation fails, extra is freed and orig is returned
 * unchanged, so the existing comment is not lost.
 */
static char * s_AddToTagComment (char *orig, char *extra)
{
  char * tag;
  size_t orig_len, extra_len;

  if (orig == NULL) {
    return extra;
  }
  if (extra == NULL) {
    return orig;
  }

  orig_len = strlen (orig);
  extra_len = strlen (extra);
  /* room for both parts, the '\n' separator and the terminator */
  tag = malloc (orig_len + extra_len + 2);
  if (tag == NULL) {
    free (extra);
    return orig;
  }
  memcpy (tag, orig, orig_len);
  tag[orig_len] = '\n';
  memcpy (tag + orig_len + 1, extra, extra_len + 1);
  free (orig);
  free (extra);
  return tag;
}
1261
1262
1263 static char * s_ReadTagComment
1264 (FReadLineFunction readfunc,
1265 void * userdata)
1266 {
1267 char *linestring;
1268 char *tag = NULL;
1269 char *cp = NULL;
1270 char *tmp;
1271 int tag_len = 0, end_len;
1272
1273 linestring = readfunc (userdata);
1274 while (linestring != NULL && linestring [0] != EOF && (cp = strchr (linestring, '}')) == NULL) {
1275 if (tag == NULL) {
1276 tag_len = strlen (linestring);
1277 tag = malloc (sizeof (char) * (tag_len + 1));
1278 strcpy (tag, linestring);
1279 } else {
1280 tag_len = tag_len + strlen (linestring) + 1;
1281 tmp = malloc (sizeof (char) * (tag_len + 1));
1282 strcpy (tmp, tag);
1283 strcat (tmp, "\n");
1284 strcat (tmp, linestring);
1285 free (tag);
1286 tag = tmp;
1287 }
1288 free (linestring);
1289 linestring = readfunc (userdata);
1290 }
1291 if (cp != NULL && cp > linestring) {
1292 end_len = cp - linestring;
1293 tag_len = tag_len + end_len + 1;
1294 tmp = malloc (sizeof (char) * (tag_len + 1));
1295 strcpy (tmp, tag);
1296 strcat (tmp, "\n");
1297 strncat (tmp, linestring, end_len);
1298 tmp[tag_len] = 0;
1299 free (tag);
1300 tag = tmp;
1301 }
1302 if (linestring != NULL) {
1303 free (linestring);
1304 }
1305
1306 return tag;
1307 }
1308
1309
1310 /* Reads the portion of and ACE file for a single contig, including the reads */
1311 static TContigPtr s_ReadContig
1312 (char ** initline,
1313 FReadLineFunction readfunc,
1314 void * userdata,
1315 char make_qual_scores,
1316 char * has_errors)
1317 {
1318 char *linestring;
1319 char *firstline;
1320 char *cp;
1321 int len = 0, read_num = 0, num_base_segs = 0;
1322 EFound val;
1323 char found_comp_char = 0;
1324 TContigPtr contig = NULL;
1325
1326 if (initline == NULL) return NULL;
1327 firstline = *initline;
1328 if (firstline == NULL || readfunc == NULL) return NULL;
1329
1330 if (firstline [0] != 'C' || firstline [1] != 'O' || ! isspace (firstline [2])) {
1331 return NULL;
1332 }
1333
1334 contig = ContigNew ();
1335 len = strlen (firstline + 3);
1336 contig->consensus_id = malloc (len + 1);
1337 strcpy (contig->consensus_id, firstline + 3);
1338
1339 cp = contig->consensus_id;
1340 while (*cp != 0 && !isspace (*cp)) {
1341 cp++;
1342 }
1343 if (isspace (*cp)) {
1344 *cp = 0;
1345 cp++;
1346 contig->consensus_assem_len = atoi (cp);
1347 while (isdigit (*cp)) {
1348 cp++;
1349 }
1350 if (isspace (*cp)) {
1351 cp++;
1352 contig->num_reads = atoi (cp);
1353 while (isdigit (*cp)) {
1354 cp++;
1355 }
1356 if (isspace (*cp)) {
1357 cp++;
1358 num_base_segs = atoi (cp);
1359 while (isdigit (*cp)) {
1360 cp++;
1361 }
1362 if (isspace (*cp)) {
1363 cp++;
1364 found_comp_char = 1;
1365 if (*cp == 'C') {
1366 contig->is_complement = 1;
1367 } else {
1368 contig->is_complement = 0;
1369 }
1370 }
1371 }
1372 }
1373 }
1374 if (contig->consensus_assem_len == 0 || contig->num_reads == 0 || !found_comp_char) {
1375 PrintACEFormatErrorXML ("Error in consensus line", contig->consensus_id, has_errors);
1376 ContigFree (contig);
1377 return NULL;
1378 }
1379
1380 /* now copy in sequence data */
1381 contig->consensus_seq = s_ReadSequenceFromFile (contig->consensus_assem_len, readfunc, userdata, contig->consensus_id, has_errors);
1382 if (contig->consensus_seq == NULL) {
1383 ContigFree (contig);
1384 return NULL;
1385 }
1386
1387 /* record actual length of consensus seq */
1388 contig->consensus_seq_len = s_GetUngappedSeqLen (contig->consensus_seq, "*");
1389
1390 /* calculate gap info */
1391 contig->gaps = GapInfoFromSequenceString (contig->consensus_seq, "*");
1392
1393 /* read quality scores */
1394 if (make_qual_scores) {
1395 val = s_ReadQualScores (contig, readfunc, userdata);
1396 if (val != eNone && val != eJustRight) {
1397 s_ReportFound (val, "quality scores", contig->consensus_id, has_errors);
1398 ContigFree (contig);
1399 return NULL;
1400 }
1401 } else {
1402 s_SkipQualScores (readfunc, userdata);
1403 }
1404
1405 /* collect reads */
1406 val = s_ReadAFLines (contig, readfunc, userdata, &linestring);
1407 if (val != eJustRight) {
1408 s_ReportFound (val, "AF lines", contig->consensus_id, has_errors);
1409 ContigFree (contig);
1410 if (linestring != NULL) free (linestring);
1411 return NULL;
1412 }
1413
1414 if (num_base_segs > 0) {
1415 val = s_ReadBaseSegs (contig, num_base_segs, linestring, readfunc, userdata);
1416 if (val != eJustRight) {
1417 s_ReportFound (val, "base segments", contig->consensus_id, has_errors);
1418 ContigFree (contig);
1419 return NULL;
1420 }
1421 }
1422
1423
1424 read_num = 0;
1425 linestring = readfunc (userdata);
1426 while (linestring != NULL && linestring [0] != EOF) {
1427 if (linestring [0] == 'R' && linestring[1] == 'D' && isspace (linestring [2])) {
1428 len = strlen (contig->reads[read_num]->read_id);
1429 if (strncmp (linestring + 3, contig->reads[read_num]->read_id, len) != 0
1430 || !isspace (linestring [3 + len])) {
1431 PrintACEFormatErrorXML ("Read IDs out of order!", contig->consensus_id, has_errors);
1432 ContigFree (contig);
1433 return NULL;
1434 }
1435 len = atoi (linestring + 3 + len);
1436 contig->reads[read_num]->read_seq = s_ReadSequenceFromFile (len, readfunc, userdata, contig->reads[read_num]->read_id, has_errors);
1437 if (contig->reads[read_num]->read_seq == NULL) {
1438 ContigFree (contig);
1439 return NULL;
1440 }
1441 s_AdjustContigReadForTerminalNs (contig->reads[read_num]);
1442 contig->reads[read_num]->read_len = s_GetUngappedSeqLen (contig->reads[read_num]->read_seq, "*");
1443 contig->reads[read_num]->gaps = GapInfoFromSequenceString (contig->reads[read_num]->read_seq, "*");
1444 read_num++;
1445 } else if (linestring [0] == 'Q' && linestring[1] == 'A' && isspace (linestring[2])) {
1446 if (read_num < 1) {
1447 PrintACEFormatErrorXML ("Found QA line before RD!", contig->consensus_id, has_errors);
1448 ContigFree (contig);
1449 return NULL;
1450 } else if (!ApplyQALineToRead (contig->reads[read_num - 1], linestring, contig->reads[read_num - 1]->read_id, has_errors)) {
1451 PrintACEFormatErrorXML ("Error in QA line format!", contig->reads[read_num - 1]->read_id, has_errors);
1452 ContigFree (contig);
1453 return NULL;
1454 }
1455 } else if (linestring[0] == 'D' && linestring[1] == 'S' && isspace (linestring[2])) {
1456 /* skip DS lines */
1457 } else if (strncmp (linestring, "RT{", 3) == 0) {
1458 contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[read_num - 1]->tag, s_ReadTagComment (readfunc, userdata));
1459 } else if (strncmp (linestring, "WR{", 3) == 0) {
1460 contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[read_num - 1]->tag, s_ReadTagComment (readfunc, userdata));
1461 } else if (strncmp (linestring, "CT{", 3) == 0) {
1462 contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata));
1463 } else if (strncmp (linestring, "WA{", 3) == 0) {
1464 contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata));
1465 } else if (linestring[0] != 0) {
1466 /* found next line */
1467 *initline = linestring;
1468 s_AdjustContigForTerminalNs (contig);
1469 s_CalculateContigOffsets (contig);
1470 return contig;
1471 }
1472 free (linestring);
1473 linestring = readfunc (userdata);
1474 }
1475 *initline = NULL;
1476 s_AdjustContigForTerminalNs (contig);
1477 s_CalculateContigOffsets (contig);
1478 return contig;
1479 }
1480
1481
/* Used to detect errors in ACE file formatting.
 * Returns 1 when linestring begins with "AF" or "RD", otherwise 0
 * (including when linestring is NULL).
 */
static char s_UnexpectedLineBetweenContigs (char *linestring)
{
  if (linestring == NULL) {
    return 0;
  }
  if (strncmp (linestring, "AF", 2) == 0 || strncmp (linestring, "RD", 2) == 0) {
    return 1;
  }
  return 0;
}
1495
1496
1497 /* This is the main function for reading in an ACE file */
1498 extern TACEFilePtr
1499 ReadACEFile
1500 (FReadLineFunction readfunc,
1501 void * userdata,
1502 char make_qual_scores,
1503 char * has_errors)
1504 {
1505 char * linestring;
1506 TACEFilePtr afp;
1507 char * cp;
1508 int contig_num = 0, read_num = 0;
1509 int num_reads_expected = 0;
1510
1511 if (readfunc == NULL) {
1512 return NULL;
1513 }
1514
1515 afp = ACEFileNew ();
1516 if (afp == NULL) {
1517 return NULL;
1518 }
1519
1520 linestring = readfunc (userdata);
1521
1522 while (linestring != NULL && linestring [0] != EOF) {
1523 if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
1524 if (num_reads_expected > 0) {
1525 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
1526 ACEFileFree (afp);
1527 free (linestring);
1528 return NULL;
1529 }
1530 /* first line in file, number of contigs */
1531 cp = linestring + 3;
1532 afp->num_contigs = atoi (cp);
1533 afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
1534 if (afp->contigs == NULL) {
1535 PrintACEFormatErrorXML ("Memory allocation failed!", NULL, has_errors);
1536 free (linestring);
1537 ACEFileFree (afp);
1538 return NULL;
1539 }
1540 while (isdigit (*cp)) {
1541 cp++;
1542 }
1543 num_reads_expected = atoi (cp);
1544 free (linestring);
1545 linestring = readfunc (userdata);
1546 } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
1547 if (contig_num >= afp->num_contigs) {
1548 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
1549 free (linestring);
1550 ACEFileFree (afp);
1551 return NULL;
1552 }
1553 afp->contigs[contig_num] = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
1554 if (afp->contigs[contig_num] == NULL) {
1555 PrintACEFormatErrorXMLStart (NULL, has_errors);
1556 printf ("Unable to read contig (%d)", contig_num);
1557 PrintACEFormatErrorXMLEnd ();
1558 ACEFileFree (afp);
1559 return NULL;
1560 }
1561 read_num += afp->contigs[contig_num]->num_reads;
1562 contig_num++;
1563 } else if (s_UnexpectedLineBetweenContigs (linestring)) {
1564 PrintACEFormatErrorXMLStart (NULL, has_errors);
1565 printf ("Unexpected line after contig %d:%s", read_num, linestring);
1566 PrintACEFormatErrorXMLEnd ();
1567 free (linestring);
1568 ACEFileFree (afp);
1569 return NULL;
1570 } else {
1571 free (linestring);
1572 linestring = readfunc (userdata);
1573 }
1574 }
1575 if (contig_num < afp->num_contigs) {
1576 PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
1577 ACEFileFree (afp);
1578 afp = NULL;
1579 } else if (read_num < num_reads_expected) {
1580 PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
1581 ACEFileFree (afp);
1582 afp = NULL;
1583 }
1584
1585 return afp;
1586 }
1587
1588
/* This function writes out sequence characters, 60 per line.
 * Every line, including a final short one, ends with a newline; an empty
 * sequence produces no output.  Does nothing when fp or seq is NULL.
 */
static void s_WriteSeq (FILE *fp, char *seq)
{
  int    col = 0;
  char  *base;

  if (fp == NULL || seq == NULL) return;

  for (base = seq; *base != 0; base++) {
    fprintf (fp, "%c", *base);
    col++;
    /* break the line when it is full or the sequence is finished */
    if (col == 60 || base[1] == 0) {
      fprintf (fp, "\n");
      col = 0;
    }
  }
}
1604
1605
1606 /* This function writes out quality scores in the ACE file format. */
1607 static void s_WriteQualScores (FILE *fp, TContigPtr contig)
1608 {
1609 int q_pos, line_pos;
1610
1611 if (fp == NULL || contig == NULL || contig->num_qual_scores == 0) return;
1612
1613 fprintf (fp, "BQ\n");
1614 q_pos = 0;
1615 while (q_pos < contig->num_qual_scores) {
1616 line_pos = 0;
1617 while (line_pos < 60 && q_pos < contig->num_qual_scores) {
1618 if (contig->consensus_seq[q_pos] != '*') {
1619 fprintf (fp, "%d ", contig->qual_scores[q_pos]);
1620 line_pos++;
1621 }
1622 q_pos++;
1623 }
1624 fprintf (fp, "\n");
1625 }
1626 }
1627
1628
1629 /* NOTE - this file does not provide all of the information required for an ACE file. */
1630 static void s_WriteContig (FILE *fp, TContigPtr contig)
1631 {
1632 int i;
1633
1634 if (contig == NULL) return;
1635
1636 fprintf (fp, "CO %s %d %d\n\n", contig->consensus_id, contig->consensus_assem_len, contig->num_reads);
1637 s_WriteSeq (fp, contig->consensus_seq);
1638 fprintf (fp, "\n");
1639
1640 s_WriteQualScores (fp, contig);
1641 fprintf (fp, "\n");
1642
1643 for (i = 0; i < contig->num_reads; i++) {
1644 fprintf (fp, "AF %s %c %d\n", contig->reads[i]->read_id,
1645 contig->reads[i]->is_complement ? 'C' : 'U',
1646 contig->reads[i]->cons_start + 1);
1647 }
1648 fprintf (fp, "\n");
1649 for (i = 0; i < contig->num_reads; i++) {
1650 fprintf (fp, "RD %s %d\n", contig->reads[i]->read_id, strlen (contig->reads[i]->read_seq));
1651 s_WriteSeq (fp, contig->reads[i]->read_seq);
1652 fprintf (fp, "\n");
1653 }
1654
1655 }
1656
1657
1658 /* NOTE - This generates an incomplete ACE file - the data structure we currently use
1659 * does not provide enough data to create a complete ACE file.
1660 */
1661 extern void WriteACEFile (FILE *fp, TACEFilePtr afp)
1662 {
1663 int i, tot_reads = 0;
1664 if (fp == NULL || afp == NULL) return;
1665
1666 for (i = 0; i < afp->num_contigs; i++) {
1667 tot_reads += afp->contigs[i]->num_reads;
1668 }
1669 fprintf (fp, "AS %d %d\n\n", afp->num_contigs, tot_reads);
1670
1671 for (i = 0; i < afp->num_contigs; i++) {
1672 s_WriteContig (fp, afp->contigs[i]);
1673 }
1674 }
1675
1676
/* This function generates a string that uses the FASTA+GAP method for specifying gaps
 * (dashes instead of asterisks).
 *
 * contig_seq  sequence using '*' for gaps (NULL is treated as empty)
 * cons_start  number of '-' characters written before the sequence
 * aln_len     length of the result; output is never written past it
 * read_start  1-based position of the first character to copy; earlier
 *             characters are replaced by '-'
 * read_stop   copying stops before this 1-based position; values below 1
 *             mean "copy to the end of contig_seq"
 *
 * The remainder of the row is filled with '-'.  Returns a newly allocated
 * string of aln_len characters, or NULL when aln_len is negative or the
 * allocation fails.
 */
static char *
s_AlignmentSeqFromContigSeq
(char *contig_seq,
 int   cons_start,
 int   aln_len,
 int   read_start,
 int   read_stop)
{
  char * aln_seq;
  char * cp;
  int    pos = 0;
  int    seq_pos = 1;   /* 1-based position in contig_seq of the character at cp */

  if (aln_len < 0) {
    return NULL;
  }
  aln_seq = malloc (sizeof (char) * (aln_len + 1));
  if (aln_seq == NULL) {
    return NULL;
  }

  /* pad start */
  while (pos < cons_start && pos < aln_len) {
    aln_seq[pos] = '-';
    pos++;
  }

  cp = (contig_seq == NULL) ? "" : contig_seq;

  /* characters ahead of read_start are shown as gaps */
  while (*cp != 0 && seq_pos < read_start && pos < aln_len) {
    aln_seq[pos] = '-';
    pos++;
    cp++;
    seq_pos++;
  }

  /* seq_pos counts sequence characters only, so the stop position is
   * independent of how much padding was written above
   */
  while (*cp != 0 && (read_stop < 1 || seq_pos < read_stop) && pos < aln_len) {
    if (*cp == '*') {
      aln_seq[pos] = '-';
    } else {
      aln_seq[pos] = *cp;
    }
    pos++;
    cp++;
    seq_pos++;
  }

  /* pad end */
  while (pos < aln_len) {
    aln_seq[pos] = '-';
    pos++;
  }
  aln_seq[pos] = 0;
  return aln_seq;
}
1725
1726
/* Returns a newly allocated copy of read_id that the caller must free.
 * Returns NULL when read_id is NULL or the copy cannot be allocated.
 */
static char * s_FarPointerIdFromReadId (char * read_id)
{
  char * far_id;
  size_t id_len;

  if (read_id == NULL) {
    return NULL;
  }
  id_len = strlen (read_id);
  far_id = malloc (sizeof (char) * (id_len + 1));
  if (far_id != NULL) {
    memcpy (far_id, read_id, id_len + 1);
  }
  return far_id;
}
1734
1735
1736 /* This function generates an intermediate data format suitable for generating
1737 * a SeqEntry with an alignment.
1738 */
1739 extern TAlignmentFilePtr AlignmentFileFromContig (TContigPtr contig)
1740 {
1741 TAlignmentFilePtr aln;
1742 int i, len;
1743 int consensus_pad = 0, pad, end_pad = 0, aln_len;
1744
1745 if (contig == NULL) return NULL;
1746
1747 aln = AlignmentFileNew ();
1748 aln->num_sequences = 1 + contig->num_reads;
1749 aln->num_organisms = 0;
1750 aln->num_deflines = 0;
1751 aln->num_segments = 1;
1752 aln->ids = malloc (sizeof (char *) * aln->num_sequences);
1753 aln->sequences = malloc (sizeof (char *) * aln->num_sequences);
1754 aln->organisms = NULL;
1755 aln->deflines = NULL;
1756 aln->align_format_found = 1;
1757 /* calculate padding for consensus */
1758 for (i = 0; i < contig->num_reads; i++) {
1759 if (contig->reads[i]->cons_start < 0) {
1760 pad = 0 - contig->reads[i]->cons_start;
1761 if (consensus_pad < pad) {
1762 consensus_pad = pad;
1763 }
1764 }
1765 len = contig->reads[i]->cons_start + strlen (contig->reads[i]->read_seq);
1766 if (len > contig->consensus_assem_len) {
1767 pad = len - contig->consensus_assem_len;
1768 if (pad > end_pad) {
1769 end_pad = pad;
1770 }
1771 }
1772 }
1773 aln_len = consensus_pad + contig->consensus_assem_len + end_pad;
1774 /* seq for consensus */
1775 len = strlen (contig->consensus_id);
1776 aln->ids[0] = malloc (sizeof (char) * (len + 1));
1777 strcpy (aln->ids[0], contig->consensus_id);
1778 aln->sequences[0] = s_AlignmentSeqFromContigSeq (contig->consensus_seq,
1779 consensus_pad,
1780 aln_len, 0, 0);
1781 for (i = 1; i < aln->num_sequences; i++) {
1782 len = strlen (contig->reads[i - 1]->read_id);
1783 aln->ids[i] = s_FarPointerIdFromReadId (contig->reads[i - 1]->read_id);
1784 aln->sequences[i] = s_AlignmentSeqFromContigSeq (contig->reads[i - 1]->read_seq,
1785 consensus_pad + contig->reads[i - 1]->cons_start,
1786 aln_len,
1787 contig->reads[i - 1]->read_assem_start,
1788 contig->reads[i - 1]->read_assem_stop);
1789 }
1790 return aln;
1791 }
1792
1793
/* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap.
 *
 * Each '*' or '-' in seq_str contributes one comma-separated entry: the
 * count of non-gap characters seen since the previous gap (or since the
 * start of the string).
 *
 * Returns a newly allocated string owned by the caller; it is empty when
 * seq_str contains no gaps.  Returns NULL when seq_str is NULL or the
 * allocation fails.
 */
extern char * TraceArchiveGapStringFromACESequence (char *seq_str)
{
  char   *cp;
  char   *gap_str;
  char   *print_pos;
  size_t  num_gaps = 0;
  int     pos = 0;

  if (seq_str == NULL) return NULL;

  /* first determine length of gap string */
  for (cp = seq_str; *cp != 0; cp++) {
    if (*cp == '*' || *cp == '-') {
      num_gaps++;
    }
  }
  /* 15 characters per entry leaves room for any int plus the comma */
  gap_str = malloc (sizeof (char) * (15 * num_gaps + 1));
  if (gap_str == NULL) return NULL;
  gap_str[0] = 0;

  print_pos = gap_str;
  for (cp = seq_str; *cp != 0; cp++) {
    if (*cp == '*' || *cp == '-') {
      print_pos += sprintf (print_pos, "%d,", pos);
      pos = 0;
    } else {
      pos++;
    }
  }
  /* trim final comma, if anything was written at all */
  if (print_pos > gap_str) {
    print_pos[-1] = 0;
  }
  return gap_str;
}
1831
1832
1833 /* NOTE - These functions are currently incomplete */
1834 extern void WriteTraceArchiveRead (FILE *fp, TContigReadPtr read)
1835 {
1836 char *cp;
1837 if (fp == NULL || read == NULL) {
1838 return;
1839 }
1840
1841 fprintf (fp, "<trace>\n");
1842 fprintf (fp, "<trace_name>%s</trace_name>\n", read->read_id);
1843 fprintf (fp, "<traceconsensus>");
1844 cp = read->read_seq;
1845 while (*cp != 0) {
1846 if (*cp != '*') {
1847 fprintf (fp, "%c", *cp);
1848 }
1849 cp++;
1850 }
1851 fprintf (fp, "</traceconsensus>\n");
1852 cp = TraceArchiveGapStringFromACESequence (read->read_seq);
1853 fprintf (fp, "<tracegaps>%s</tracegaps>\n", cp);
1854 free (cp);
1855 fprintf (fp, "</trace>\n");
1856 }
1857
1858
/* Returns the number of characters at the start of str that precede the
 * first whitespace character or the end of the string; 0 when str is NULL.
 */
static int s_GetTokenLen (char *str)
{
  int n;

  if (str == NULL) return 0;

  for (n = 0; str[n] != 0 && !isspace (str[n]); n++) {
    /* counting only */
  }
  return n;
}
1873
1874
/* Skips leading whitespace in str, then up to num_tokens whitespace-delimited
 * tokens together with the whitespace following each.  Returns a pointer to
 * what follows, or NULL when str is NULL, num_tokens is negative, or the end
 * of the string was reached.
 */
static char * s_SkipTokens (char *str, int num_tokens)
{
  char *p;
  int   remaining;

  if (str == NULL || num_tokens < 0) return NULL;

  p = str;
  /* skip leading whitespace */
  while (isspace (*p)) {
    p++;
  }

  for (remaining = num_tokens; remaining > 0 && *p != 0; remaining--) {
    /* skip token */
    while (*p != 0 && !isspace (*p)) {
      p++;
    }
    /* skip trailing whitespace */
    while (isspace (*p)) {
      p++;
    }
  }

  return (*p == 0) ? NULL : p;
}
1904
1905
1906 /* for reading other formats */
1907 extern TContigReadPtr
1908 ReadContigFromString
1909 (char *str,
1910 char **consensus_id,
1911 int id_col,
1912 int seq_col,
1913 int contig_id_col,
1914 int strand_col,
1915 int start_col,
1916 int interpret_n_col
1917 )
1918 {
1919 TContigReadPtr read = NULL;
1920 char *cp;
1921 int len, col_num = 1, n_is_gap = 0;
1922 int max_col;
1923
1924 if (str == NULL) {
1925 return NULL;
1926 }
1927
1928 max_col = id_col;
1929 if (seq_col > max_col) {
1930 max_col = seq_col;
1931 }
1932 if (contig_id_col > max_col) {
1933 max_col = contig_id_col;
1934 }
1935 if (strand_col > max_col) {
1936 max_col = strand_col;
1937 }
1938 if (start_col > max_col) {
1939 max_col = start_col;
1940 }
1941 if (interpret_n_col > max_col) {
1942 max_col = interpret_n_col;
1943 }
1944
1945 read = ContigReadNew ();
1946
1947 cp = str;
1948 len = s_GetTokenLen (cp);
1949 while (cp != NULL && *cp != 0 && col_num <= max_col) {
1950 if (id_col == col_num) {
1951 read->read_id = malloc (len + 1);
1952 strncpy (read->read_id, cp, len);
1953 read->read_id[len] = 0;
1954 } else if (seq_col == col_num) {
1955 read->read_seq = malloc (len + 1);
1956 strncpy (read->read_seq, cp, len);
1957 read->read_seq[len] = 0;
1958 read->read_len = len;
1959 } else if (contig_id_col == col_num) {
1960 if (consensus_id != NULL) {
1961 *consensus_id = malloc (len + 1);
1962 strncpy (*consensus_id, cp, len);
1963 (*consensus_id)[len] = 0;
1964 }
1965 } else if (strand_col == col_num) {
1966 if (*cp == 'R' || *cp == '-') {
1967 read->is_complement = 1;
1968 }
1969 } else if (start_col == col_num) {
1970 read->cons_start = atoi (cp);
1971 } else if (interpret_n_col == col_num) {
1972 if (*cp == 'I') {
1973 n_is_gap = 1;
1974 }
1975 }
1976 /* advance to next token */
1977 col_num++;
1978 cp = s_SkipTokens (cp, 1);
1979 len = s_GetTokenLen (cp);
1980 }
1981
1982 if (max_col > col_num) {
1983 ContigReadFree (read);
1984 read = NULL;
1985 } else {
1986 read->cons_stop = read->cons_start + read->read_len - 1;
1987 read->tiling_start = read->cons_start;
1988 read->tiling_stop = read->cons_stop;
1989 read->read_assem_start = 0;
1990 read->read_assem_stop = read->read_len - 1;
1991 read->read_start = 1;
1992 read->read_stop = read->read_len;
1993 if (n_is_gap) {
1994 /* adjust for gaps */
1995 read->gaps = GapInfoFromSequenceString (read->read_seq, "N");
1996 if (read->gaps->num_gaps > 0) {
1997 RemoveGapCharsFromSequenceString (read->read_seq, "N");
1998 read->read_stop -= read->gaps->num_gaps;
1999 read->read_len -= read->gaps->num_gaps;
2000 }
2001 }
2002 }
2003
2004 return read;
2005 }
2006
2007
2008 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromMAQString (char *str, char **consensus_id)
2009 {
2010 TContigReadPtr read = NULL;
2011
2012 read = ReadContigFromString (str, consensus_id, 1, 15, 2, 4, 3, 0);
2013 return read;
2014 }
2015
2016
2017 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandMostCompressed (char *str, char **consensus_id)
2018 {
2019 TContigReadPtr read = NULL;
2020
2021 read = ReadContigFromString (str, consensus_id, 0, 1, 0, 5, 4, 0);
2022 return read;
2023 }
2024
2025
2026 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandSanger (char *str, char **consensus_id)
2027 {
2028 TContigReadPtr read = NULL;
2029
2030 read = ReadContigFromString (str, consensus_id, 1, 2, 4, 6, 5, 0);
2031 return read;
2032 }
2033
2034
2035 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandStandalone (char *str, char **consensus_id)
2036 {
2037 TContigReadPtr read = NULL;
2038
2039 read = ReadContigFromString (str, consensus_id, 1, 2, 7, 9, 8, 10);
2040 return read;
2041 }
2042
2043
2044 #define READ_BLOCK_SIZE 50
2045
2046 typedef struct ReadList {
2047 TContigReadPtr reads[READ_BLOCK_SIZE];
2048 int num_reads;
2049 struct ReadList * next;
2050 } SReadList, * TReadListPtr;
2051
2052
2053 static TReadListPtr ReadListNew ()
2054 {
2055 TReadListPtr r;
2056
2057 r = malloc (sizeof (SReadList));
2058 r->num_reads = 0;
2059 r->next = NULL;
2060 return r;
2061 }
2062
2063
2064 static TReadListPtr ReadListFree (TReadListPtr r)
2065 {
2066 TReadListPtr r_next;
2067 int i;
2068
2069 while (r != NULL) {
2070 for (i = 0; i < r->num_reads; i++) {
2071 ContigReadFree (r->reads[i]);
2072 }
2073 r_next = r;
2074 free (r);
2075 r = r_next;
2076 }
2077 return r;
2078 }
2079
2080
2081 static TReadListPtr AddToReadList (TContigReadPtr read, TReadListPtr read_list)
2082 {
2083 if (read_list == NULL) {
2084 read_list = ReadListNew();
2085 } else {
2086 while (read_list->next != NULL && read_list->num_reads == READ_BLOCK_SIZE) {
2087 read_list = read_list->next;
2088 }
2089 if (read_list->num_reads == READ_BLOCK_SIZE) {
2090 read_list->next = ReadListNew();
2091 read_list = read_list->next;
2092 }
2093 }
2094 read_list->reads[read_list->num_reads++] = read;
2095 return read_list;
2096 }
2097
2098
2099 typedef struct ConsensusReads {
2100 TContigPtr contig;
2101 TReadListPtr read_list;
2102 TReadListPtr last_read;
2103 struct ConsensusReads * next;
2104 } SConsensusReads, * TConsensusReadsPtr;
2105
2106
2107 static TConsensusReadsPtr ConsensusReadsNew (char *consensus_id)
2108 {
2109 TConsensusReadsPtr c;
2110
2111 c = malloc (sizeof (SConsensusReads));
2112 c->contig = ContigNew ();
2113 if (consensus_id != NULL) {
2114 c->contig->consensus_id = malloc (strlen (consensus_id) + 1);
2115 strcpy (c->contig->consensus_id, consensus_id);
2116 }
2117
2118 c->read_list = NULL;
2119 c->last_read = NULL;
2120 c->next = NULL;
2121 return c;
2122 }
2123
2124
2125 static TConsensusReadsPtr ConsensusReadsFree (TConsensusReadsPtr c)
2126 {
2127 TConsensusReadsPtr c_next;
2128
2129 while (c != NULL) {
2130 c_next = c->next;
2131 ContigFree (c->contig);
2132 c->read_list = ReadListFree (c->read_list);
2133 free (c);
2134 c = c_next;
2135 }
2136 return c;
2137 }
2138
2139
2140 static void AddReadToConsensusReads (TConsensusReadsPtr c, TContigReadPtr read)
2141 {
2142 if (c != NULL && read != NULL) {
2143 c->last_read = AddToReadList (read, c->read_list);
2144 if (c->read_list == NULL) {
2145 c->read_list = c->last_read;
2146 }
2147 }
2148 }
2149
2150 #define CONSENSUS_BLOCK_SIZE 50
2151
2152 typedef struct ConsensusReadsList {
2153 TConsensusReadsPtr contigs[CONSENSUS_BLOCK_SIZE];
2154 int num_contigs;
2155 struct ConsensusReadsList * next;
2156 } SConsensusReadsList, * TConsensusReadsListPtr;
2157
2158
2159 static TConsensusReadsListPtr ConsensusReadsListNew ()
2160 {
2161 TConsensusReadsListPtr c;
2162
2163 c = malloc (sizeof (SConsensusReadsList));
2164 c->num_contigs = 0;
2165 c->next = NULL;
2166 return c;
2167 }
2168
2169
2170 static TConsensusReadsListPtr ConsensusReadsListFree (TConsensusReadsListPtr c)
2171 {
2172 TConsensusReadsListPtr c_next;
2173 int i;
2174
2175 while (c != NULL) {
2176 c_next = c->next;
2177 for (i = 0; i < c->num_contigs; i++) {
2178 c->contigs[i] = ConsensusReadsFree (c->contigs[i]);
2179 }
2180 free (c);
2181 c = c_next;
2182 }
2183 return c;
2184 }
2185
2186
2187 static TConsensusReadsPtr FindConsensusIDInConsensusReadsList (TConsensusReadsListPtr c, char *consensus_id)
2188 {
2189 int i;
2190 TConsensusReadsPtr r = NULL;
2191
2192 if (consensus_id == NULL) {
2193 return NULL;
2194 }
2195 while (c != NULL && r == NULL) {
2196 for (i = 0; i < c->num_contigs && r == NULL; i++) {
2197 if (c->contigs[i] != NULL
2198 && c->contigs[i]->contig != NULL
2199 && strcmp (c->contigs[i]->contig->consensus_id, consensus_id) == 0) {
2200 r = c->contigs[i];
2201 }
2202 }
2203 c = c->next;
2204 }
2205 return r;
2206 }
2207
2208
2209 static TConsensusReadsListPtr
2210 AddConsensusReadToConsensusReadsList
2211 (TConsensusReadsListPtr c,
2212 char * consensus_id,
2213 TContigReadPtr read)
2214 {
2215 TConsensusReadsPtr r = NULL;
2216
2217 if (c == NULL) {
2218 c = ConsensusReadsListNew ();
2219 r = ConsensusReadsNew(consensus_id);
2220 c->contigs[c->num_contigs++] = r;
2221 } else {
2222 r = FindConsensusIDInConsensusReadsList (c, consensus_id);
2223 if (r == NULL) {
2224 while (c->next != NULL && c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2225 c = c->next;
2226 }
2227 if (c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2228 c->next = ConsensusReadsListNew ();
2229 c = c->next;
2230 }
2231 r = ConsensusReadsNew(consensus_id);
2232 c->contigs[c->num_contigs++] = r;
2233 }
2234 }
2235 AddReadToConsensusReads (r, read);
2236
2237 return c;
2238 }
2239
2240
2241 static void MoveReadsToContigFromReadList (TContigPtr contig, TReadListPtr read_list)
2242 {
2243 TReadListPtr r;
2244 int n = 0, i;
2245
2246 if (contig == NULL) {
2247 return;
2248 }
2249
2250 for (r = read_list; r != NULL; r = r->next) {
2251 n += r->num_reads;
2252 }
2253
2254 contig->num_reads = n;
2255 contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
2256 n = 0;
2257
2258 for (r = read_list; r != NULL; r = r->next) {
2259 for (i = 0; i < r->num_reads; i++) {
2260 contig->reads[n++] = r->reads[i];
2261 r->reads[i] = NULL;
2262 }
2263 r->num_reads = 0;
2264 }
2265 }
2266
2267
2268 static TACEFilePtr ACEFileFromConsensusReadsList (TConsensusReadsListPtr contig_list)
2269 {
2270 TACEFilePtr afp = NULL;
2271 TConsensusReadsListPtr c;
2272 int i, n = 0;
2273
2274 if (contig_list == NULL || contig_list->num_contigs == 0) {
2275 return NULL;
2276 }
2277
2278 afp = ACEFileNew ();
2279 for (c = contig_list; c != NULL; c=c->next) {
2280 afp->num_contigs += c->num_contigs;
2281 }
2282
2283 afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
2284 for (c = contig_list; c != NULL; c = c->next) {
2285 for (i = 0; i < c->num_contigs; i++) {
2286 MoveReadsToContigFromReadList (c->contigs[i]->contig, c->contigs[i]->read_list);
2287 afp->contigs[n++] = c->contigs[i]->contig;
2288 c->contigs[i]->contig = NULL;
2289 c->contigs[i]->last_read = NULL;
2290 }
2291 c->num_contigs = 0;
2292 }
2293 return afp;
2294 }
2295
2296
2297 extern TACEFilePtr ReadAssemblyFile
2298 (FReadLineFunction readfunc, /* function for reading lines of
2299 * alignment file
2300 */
2301 void * fileuserdata, /* data to be passed back each time
2302 * readfunc is invoked
2303 */
2304 FReadFromStringFunction makeread_func) /* function to transform a string into a read */
2305 {
2306 TACEFilePtr afp = NULL;
2307 TContigReadPtr read;
2308 TConsensusReadsListPtr contig_list = NULL, contig_last = NULL;
2309 char *linestring;
2310 char *consensus_id = NULL;
2311
2312 if (readfunc == NULL || makeread_func == NULL) {
2313 return NULL;
2314 }
2315 linestring = readfunc (fileuserdata);
2316
2317 while (linestring != NULL && linestring [0] != EOF) {
2318 /* get ContigRead */
2319 read = makeread_func (linestring, &consensus_id);
2320
2321 /* group with other ContigReads from the same consensus_id */
2322 if (read != NULL && consensus_id != NULL) {
2323 contig_last = AddConsensusReadToConsensusReadsList (contig_last, consensus_id, read);
2324 if (contig_list == NULL) {
2325 contig_list = contig_last;
2326 }
2327 read = NULL;
2328 }
2329 if (consensus_id != NULL) {
2330 free (consensus_id);
2331 consensus_id = NULL;
2332 }
2333 ContigReadFree (read);
2334 free (linestring);
2335 linestring = readfunc (fileuserdata);
2336 }
2337
2338 afp = ACEFileFromConsensusReadsList (contig_list);
2339 contig_list = ConsensusReadsListFree (contig_list);
2340 return afp;
2341 }
2342
2343
2344 extern TACEFilePtr ReadMAQFile
2345 (FReadLineFunction readfunc, /* function for reading lines of
2346 * alignment file
2347 */
2348 void * fileuserdata) /* data to be passed back each time
2349 * readfunc is invoked
2350 */
2351 {
2352 return ReadAssemblyFile (readfunc, fileuserdata, ReadFromMAQString);
2353 }
2354
2355
2356 extern TACEFilePtr ReadElandStandaloneFile
2357 (FReadLineFunction readfunc, /* function for reading lines of
2358 * alignment file
2359 */
2360 void * fileuserdata) /* data to be passed back each time
2361 * readfunc is invoked
2362 */
2363 {
2364 return ReadAssemblyFile (readfunc, fileuserdata, ReadFromElandStandalone);
2365 }
2366
2367
2368 /* functions for writing out XML */
2369 static void WriteTraceGapsXML (TGapInfoPtr gap_info, FILE *fp)
2370 {
2371 int i;
2372
2373 if (gap_info != NULL && fp != NULL) {
2374 fprintf (fp, " <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps);
2375 if (gap_info->num_gaps > 0) {
2376 fprintf (fp, " <tracegaps source=\"INLINE\">");
2377 for (i = 0; i < gap_info->num_gaps - 1; i++) {
2378 fprintf (fp, "%d ", gap_info->gap_offsets[i]);
2379 }
2380 fprintf (fp, "%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]);
2381 } else {
2382 fprintf (fp, " <tracegaps source=\"INLINE\"> </tracegaps>\n");
2383 }
2384 }
2385 }
2386
2387
2388 static void WriteTraceReadXML (TContigReadPtr read, FILE *fp)
2389 {
2390 if (read != NULL && fp != NULL) {
2391 fprintf (fp, "<trace>\n");
2392 if (read->ti > 0) {
2393 fprintf (fp, " <ti>%d</ti>\n", read->ti);
2394 }
2395 if (read->srr != NULL) {
2396 fprintf (fp, " <srr>%s</srr>\n", read->srr);
2397 }
2398 if (read->read_id != NULL) {
2399 fprintf (fp, " <trace_name>%s</trace_name>\n", read->read_id);
2400 }
2401 fprintf (fp, " <nbasecalls>%d</nbasecalls>\n", read->read_len);
2402 fprintf (fp, " <valid>\n");
2403 fprintf (fp, " <start>%d</start>\n", read->read_start);
2404 fprintf (fp, " <stop>%d</stop>\n", read->read_stop);
2405 fprintf (fp, " </valid>\n");
2406 fprintf (fp, " <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD");
2407 fprintf (fp, " <start>%d</start>\n", read->tiling_start);
2408 fprintf (fp, " <stop>%d</stop>\n", read->tiling_stop);
2409 fprintf (fp, " </tiling>\n");
2410 fprintf (fp, " <traceconsensus>\n");
2411 fprintf (fp, " <start>%d</start>\n", read->cons_start);
2412 fprintf (fp, " <stop>%d</stop>\n", read->cons_stop);
2413 fprintf (fp, " </traceconsensus>\n");
2414 WriteTraceGapsXML (read->gaps, fp);
2415 fprintf (fp, "</trace>\n");
2416 }
2417 }
2418
2419
2420 extern void WriteTraceAssemblyFromContig (TContigPtr contig, FILE *fp)
2421 {
2422 int i;
2423
2424 if (contig == NULL || fp == NULL) return;
2425
2426 /* NOTE - need to add new field to TContigPtr for submitter reference, where orig ID should move to */
2427 fprintf (fp, " <contig submitter_reference=\"%s\" conformation=\"LINEAR\" type=\"NEW\">\n",
2428 contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2429
2430 fprintf (fp, " <ntraces>%d</ntraces>\n", contig->num_reads);
2431
2432 fprintf (fp, " <nconbases>%d</nconbases>\n", contig->consensus_seq_len);
2433
2434 /* need nbasecalls */
2435
2436 if (contig->gaps == NULL) {
2437 fprintf (fp, " <ncongaps>0</ncongaps>\n");
2438 } else {
2439 fprintf (fp, " <ncongaps>%d</ncongaps>\n", contig->gaps->num_gaps);
2440 if (contig->gaps->num_gaps > 0) {
2441 fprintf (fp, " <congaps source=\"INLINE\">");
2442 for (i = 0; i < contig->gaps->num_gaps - 1; i++) {
2443 fprintf (fp, "%d ", contig->gaps->gap_offsets[i]);
2444 }
2445 fprintf (fp, "%d</congaps>\n", contig->gaps->gap_offsets[contig->gaps->num_gaps - 1]);
2446 }
2447 }
2448 fprintf (fp, " <consensus>%s</consensus>\n",
2449 contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2450 if (contig->num_qual_scores > 0) {
2451 fprintf (fp, " <conqualities source=\"INLINE\">");
2452 for (i = 0; i < contig->num_qual_scores; i++) {
2453 fprintf (fp, "%d ", contig->qual_scores[i]);
2454 }
2455 fprintf (fp, "</conqualities>\n");
2456 }
2457
2458 for (i = 0; i < contig->num_reads; i++) {
2459 WriteTraceReadXML (contig->reads[i], fp);
2460 }
2461 fprintf (fp, " </contig>\n");
2462 }
2463
2464
/* Writes the XML declaration, the opening <assembly> tag and the
 * assembly-level summary elements to fp.
 *
 * NULL string arguments are replaced by defaults: "Not supplied" for subref,
 * center_name and description, "NEW" for assembly_type and
 * "transcript assembly" for assembly.  Coverage is num_readbases divided by
 * num_conbases, or 0 when num_conbases is 0.  Nothing is written when fp is
 * NULL.  The <assembly> element is left open.
 */
extern void
WriteTraceAssemblyHeader
(char * assembly_type,
 char * subref,
 char * center_name,
 int    taxid,
 char * description,
 char * assembly,
 int    num_contigs,
 unsigned int num_conbases,
 int    num_reads,
 unsigned int num_readbases,
 FILE * fp)
{
  const char *type_text   = "NEW";
  const char *subref_text = "Not supplied";
  const char *center_text = "Not supplied";
  const char *descr_text  = "Not supplied";
  const char *struct_text = "transcript assembly";
  float       coverage    = 0;

  if (fp == NULL) {
    return;
  }

  if (assembly_type != NULL) {
    type_text = assembly_type;
  }
  if (subref != NULL) {
    subref_text = subref;
  }
  if (center_name != NULL) {
    center_text = center_name;
  }
  if (description != NULL) {
    descr_text = description;
  }
  if (assembly != NULL) {
    struct_text = assembly;
  }
  if (num_conbases != 0) {
    coverage = (float) num_readbases / (float) num_conbases;
  }

  fprintf (fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

  fprintf (fp, "<assembly submitter_reference=\"%s\" type = \"%s\">\n", subref_text, type_text);
  fprintf (fp, " <center_name>%s</center_name>\n", center_text);
  fprintf (fp, " <organism descriptor=\"TAXID\">%d</organism>\n", taxid);
  fprintf (fp, " <description>%s</description>\n", descr_text);
  fprintf (fp, " <structure>%s</structure>\n", struct_text);
  fprintf (fp, " <ncontigs>%d</ncontigs>\n", num_contigs);
  fprintf (fp, " <nconbases>%u</nconbases>\n", num_conbases);
  fprintf (fp, " <ntraces>%d</ntraces>\n", num_reads);
  fprintf (fp, " <nbasecalls>%u</nbasecalls>\n", num_readbases);
  fprintf (fp, " <coverage>%f</coverage>\n", coverage);
}
2498
2499
/* Writes the closing </assembly> tag to fp; does nothing when fp is NULL. */
extern void WriteTraceAssemblyTrailer (FILE *fp)
{
  if (fp != NULL) {
    fputs ("</assembly>\n", fp);
  }
}
2507
2508
2509 extern void
2510 WriteTraceAssemblyFromAceFile
2511 (TACEFilePtr afp,
2512 char * subref,
2513 char * center_name,
2514 int taxid,
2515 char * description,
2516 FILE *fp)
2517 {
2518 int i, j, traces = 0;
2519 unsigned int conbases = 0, basecalls = 0;
2520
2521 if (afp == NULL || fp == NULL) return;
2522
2523
2524 for (i = 0; i < afp->num_contigs; i++) {
2525 conbases += afp->contigs[i]->consensus_seq_len;
2526 traces += afp->contigs[i]->num_reads;
2527 for (j = 0; j < afp->contigs[i]->num_reads; j++) {
2528 basecalls += afp->contigs[i]->reads[j]->read_len;
2529 }
2530 }
2531 WriteTraceAssemblyHeader (NULL, subref, center_name, taxid, description, NULL, afp->num_contigs, conbases, traces, basecalls, fp);
2532
2533 for (i = 0; i < afp->num_contigs; i++) {
2534 WriteTraceAssemblyFromContig (afp->contigs[i], fp);
2535 }
2536 WriteTraceAssemblyTrailer (fp);
2537 }
2538
2539
2540 extern void WriteFASTAFromContig
2541 (TContigPtr contig,
2542 FILE *fp)
2543 {
2544 int k;
2545 char *cp;
2546
2547 if (contig == NULL || fp == NULL) return;
2548
2549 fprintf (fp, ">%s\n", contig->consensus_id);
2550 cp = contig->consensus_seq;
2551 while (*cp != 0) {
2552 k = 0;
2553 while (k < 40 && *cp != 0) {
2554 if (*cp != '*') {
2555 fprintf (fp, "%c", *cp);
2556 k++;
2557 }
2558 cp++;
2559 }
2560 fprintf (fp, "\n");
2561 }
2562 fprintf (fp, "\n");
2563 }
2564
2565
2566 extern void
2567 WriteFASTAFromAceFile
2568 (TACEFilePtr afp,
2569 FILE *fp)
2570 {
2571 int i;
2572
2573 if (afp == NULL || fp == NULL) return;
2574
2575 for (i = 0; i < afp->num_contigs; i++) {
2576 WriteFASTAFromContig (afp->contigs[i], fp);
2577 }
2578 }
2579
2580
/* Number of sequence characters stored in one node of a FASTA buffer chain. */
#define kFASTASeqBufSize 100

/* One node of a singly linked chain holding the characters of a FASTA
 * sequence.  buf is not NUL-terminated; only the first num_used entries
 * are meaningful.
 */
struct fastaseqbuf {
  char                buf[kFASTASeqBufSize]; /* sequence characters */
  int                 num_used;              /* number of valid entries in buf */
  struct fastaseqbuf *next;                  /* following node, or NULL */
};

typedef struct fastaseqbuf  SFASTASeqBuf;
typedef struct fastaseqbuf *TFASTASeqBufPtr;
2588
2589
2590 static TFASTASeqBufPtr s_FASTASeqBufNew ()
2591 {
2592 TFASTASeqBufPtr s;
2593
2594 s = (TFASTASeqBufPtr) malloc (sizeof (SFASTASeqBuf));
2595 if (s != NULL) {
2596 s->num_used = 0;
2597 s->next = NULL;
2598 }
2599 return s;
2600 }
2601
2602
2603 static void s_FASTASeqBufFree (TFASTASeqBufPtr s)
2604 {
2605 TFASTASeqBufPtr s_next;
2606
2607 while (s != NULL) {
2608 s_next = s->next;
2609 free (s);
2610 s = s_next;
2611 }
2612 }
2613
2614
2615 static TFASTASeqBufPtr s_AddFASTAToBuf (char *line, TFASTASeqBufPtr buf)
2616 {
2617 TFASTASeqBufPtr last_buf;
2618 char *cp;
2619
2620 if (buf == NULL) {
2621 buf = s_FASTASeqBufNew();
2622 last_buf = buf;
2623 } else {
2624 last_buf = buf;
2625 while (last_buf->next != NULL) {
2626 last_buf = last_buf->next;
2627 }
2628 }
2629
2630 cp = line;
2631 while (*cp != 0 && *cp != '\r' && *cp != '\n') {
2632 while (isspace (*cp)) {
2633 cp++;
2634 }
2635 if (*cp != 0) {
2636 if (!isalpha (*cp)) {
2637 printf ("Found bad character in FASTA file!\n");
2638 s_FASTASeqBufFree (buf);
2639 buf = NULL;
2640 return buf;
2641 }
2642 if (last_buf->num_used == kFASTASeqBufSize) {
2643 last_buf->next = s_FASTASeqBufNew ();
2644 last_buf = last_buf->next;
2645 }
2646 last_buf->buf[last_buf->num_used++] = *cp;
2647 cp++;
2648 }
2649 }
2650 return buf;
2651 }
2652
2653
/* Returns a newly allocated copy of str with every '*' removed.
 *
 * Returns NULL when str is NULL or when memory cannot be allocated.
 * The caller owns the result and must free() it.
 */
static char * s_StripStars (char *str)
{
  char *cp_src;
  char *cp_dst;
  char *stripped;

  if (str == NULL) {
    return NULL;
  }

  /* the stripped copy can never be longer than the input */
  stripped = (char *) malloc (sizeof (char) * (strlen (str) + 1));
  if (stripped == NULL) {
    return NULL;
  }

  cp_dst = stripped;
  for (cp_src = str; *cp_src != 0; cp_src++) {
    if (*cp_src != '*') {
      *cp_dst = *cp_src;
      cp_dst++;
    }
  }
  *cp_dst = 0;
  return stripped;
}
2676
2677
2678 static int s_DoesFASTAMatchSeq (TFASTASeqBufPtr buf, char *trimmed_seq)
2679 {
2680 int does_match = 1, match_len, seq_len;
2681
2682 if (buf == NULL || trimmed_seq == NULL || *trimmed_seq == 0) {
2683 return 1;
2684 }
2685
2686 seq_len = strlen (trimmed_seq);
2687 while (buf != NULL && seq_len > 0 && does_match) {
2688 if (seq_len < buf->num_used) {
2689 match_len = seq_len;
2690 } else {
2691 match_len = buf->num_used;
2692 }
2693 if (strncmp (buf->buf, trimmed_seq, match_len) == 0) {
2694 buf = buf->next;
2695 trimmed_seq += match_len;
2696 seq_len -= match_len;
2697 } else {
2698 does_match = 0;
2699 }
2700 }
2701 return does_match;
2702 }
2703
2704
/* Returns the complement of an upper-case nucleotide letter
 * (A<->T, G<->C).  Any other character is returned unchanged.
 */
static char s_CompLetter (char ch)
{
  if (ch == 'A') {
    return 'T';
  }
  if (ch == 'T') {
    return 'A';
  }
  if (ch == 'G') {
    return 'C';
  }
  if (ch == 'C') {
    return 'G';
  }
  return ch;
}
2723
2724
/* Reverse-complements seq in place: the character order is reversed and
 * each character is passed through s_CompLetter.  NULL and empty strings
 * are left alone.
 */
static void s_RevCompSequence (char *seq)
{
  char *head;
  char *tail;
  char  saved;

  if (seq == NULL || *seq == 0) {
    return;
  }

  head = seq;
  tail = seq + strlen (seq) - 1;

  /* swap complemented characters from both ends towards the middle */
  while (head < tail) {
    saved = *head;
    *head = s_CompLetter (*tail);
    *tail = s_CompLetter (saved);
    head++;
    tail--;
  }

  /* odd length: the middle character only needs complementing */
  if (head == tail) {
    *head = s_CompLetter (*head);
  }
}
2744
2745
2746 static int s_GetSequenceOffset (TFASTASeqBufPtr buf, char *trimmed_seq, char is_complement)
2747 {
2748 int offset = 0, buf_offset;
2749 int match_found = 0;
2750 int match_len, seq_len;
2751
2752 if (buf == NULL || trimmed_seq == NULL) {
2753 return 0;
2754 }
2755
2756 trimmed_seq = s_StripStars (trimmed_seq);
2757
2758 if (is_complement) {
2759 s_RevCompSequence (trimmed_seq);
2760 }
2761
2762 seq_len = strlen (trimmed_seq);
2763
2764 while (buf != NULL && !match_found) {
2765 buf_offset = 0;
2766 while (buf_offset < buf->num_used && !match_found) {
2767 if (buf->num_used - buf_offset < seq_len) {
2768 match_len = buf->num_used - buf_offset;
2769 } else {
2770 match_len = seq_len;
2771 }
2772 if (match_len < seq_len && buf->next == NULL) {
2773 /* ran out of sequence, no match */
2774 buf_offset = buf->num_used;
2775 } else if (strncmp (buf->buf + buf_offset, trimmed_seq, match_len) == 0
2776 && s_DoesFASTAMatchSeq (buf->next, trimmed_seq + match_len)) {
2777 match_found = 1;
2778 } else {
2779 buf_offset++;
2780 offset++;
2781 }
2782 }
2783 if (!match_found) {
2784 buf = buf->next;
2785 }
2786 }
2787 free (trimmed_seq);
2788 if (match_found) {
2789 return offset;
2790 } else {
2791 return -1;
2792 }
2793 }
2794
2795
2796 #define kSeqListBufSize 100
2797
2798 typedef struct seqlistbuf {
2799 TFASTASeqBufPtr buf[kSeqListBufSize];
2800 char * id_list[kSeqListBufSize];
2801 int num_used;
2802 struct seqlistbuf *next;
2803 } SSeqListBuf, * TSeqListBufPtr;
2804
2805
2806 static TSeqListBufPtr s_SeqListBufNew ()
2807 {
2808 TSeqListBufPtr s;
2809
2810 s = (TSeqListBufPtr) malloc (sizeof (SSeqListBuf));
2811 if (s != NULL) {
2812 s->num_used = 0;
2813 s->next = NULL;
2814 }
2815 return s;
2816 }
2817
2818
2819 static void s_SeqListBufFree (TSeqListBufPtr s)
2820 {
2821 TSeqListBufPtr s_next;
2822 int i;
2823
2824 while (s != NULL) {
2825 s_next = s->next;
2826 for (i = 0; i < s->num_used; i++) {
2827 s_FASTASeqBufFree (s->buf[i]);
2828 free (s->id_list[i]);
2829 s->buf[i] = NULL;
2830 }
2831 free (s);
2832 s = s_next;
2833 }
2834 }
2835
2836
/* Extracts the identifier from a FASTA definition line.
 *
 * Leading spaces, tabs and '>' characters are skipped; the identifier runs
 * up to the next space, tab, carriage return, newline or end of string.
 * Returns a newly allocated copy that the caller must free(), or NULL when
 * str is NULL, the identifier is empty, or allocation fails.
 */
static char * s_GetFASTAIdFromString (char * str)
{
  const char *start;
  size_t      id_len;
  char       *copy;

  if (str == NULL) {
    return NULL;
  }

  start = str + strspn (str, " >\t");
  id_len = strcspn (start, " \t\r\n");
  if (id_len == 0) {
    return NULL;
  }

  copy = (char *) malloc (id_len + 1);
  if (copy != NULL) {
    memcpy (copy, start, id_len);
    copy[id_len] = 0;
  }
  return copy;
}
2861
2862
2863 static TSeqListBufPtr s_AddToSeqList (char *line, TSeqListBufPtr buf)
2864 {
2865 TSeqListBufPtr last_buf;
2866
2867 if (line == NULL) {
2868 return buf;
2869 }
2870 if (buf == NULL) {
2871 buf = s_SeqListBufNew();
2872 last_buf = buf;
2873 } else {
2874 last_buf = buf;
2875 while (last_buf->next != NULL) {
2876 last_buf = last_buf->next;
2877 }
2878 }
2879
2880 if (*line == '>') {
2881 if (last_buf->num_used == kSeqListBufSize) {
2882 last_buf->next = s_SeqListBufNew();
2883 last_buf = last_buf->next;
2884 }
2885 last_buf->buf[last_buf->num_used] = s_FASTASeqBufNew ();
2886 last_buf->id_list[last_buf->num_used] = s_GetFASTAIdFromString (line);
2887 last_buf->num_used++;
2888 } else if (last_buf->num_used > 0) {
2889 last_buf->buf[last_buf->num_used - 1] = s_AddFASTAToBuf (line, last_buf->buf[last_buf->num_used - 1]);
2890 }
2891
2892 return buf;
2893 }
2894
2895
2896 static TFASTASeqBufPtr s_GetFastaSeq (TSeqListBufPtr buf, char *id)
2897 {
2898 TFASTASeqBufPtr seq = NULL;
2899 char *cp;
2900 int i, match_len;
2901
2902 if (id == NULL) {
2903 return NULL;
2904 }
2905
2906 cp = strchr (id, '.');
2907 if (cp == NULL) {
2908 match_len = strlen (id);
2909 } else {
2910 match_len = cp - id;
2911 }
2912
2913 while (buf != NULL && seq == NULL) {
2914 for (i = 0; i < buf->num_used && seq == NULL; i++) {
2915 if (strncmp (id, buf->id_list[i], match_len) == 0) {
2916 seq = buf->buf[i];
2917 }
2918 }
2919 buf = buf->next;
2920 }
2921 return seq;
2922 }
2923
2924
2925 static TSeqListBufPtr s_ReadFastaFile (FReadLineFunction readfunc, void * userdata)
2926 {
2927 char *linestring;
2928 TSeqListBufPtr fasta = NULL;
2929
2930 if (readfunc == NULL) {
2931 return NULL;
2932 }
2933 linestring = readfunc (userdata);
2934 while (!s_IsEOF (linestring)) {
2935 fasta = s_AddToSeqList (linestring, fasta);
2936 free (linestring);
2937 linestring = readfunc (userdata);
2938 }
2939 return fasta;
2940 }
2941
2942
2943
2944
/* Number of quality scores stored in one node of a score list. */
#define kQualScoreBufSize 100

/* One node of a singly linked list of quality scores.  Only the first
 * num_used entries of scores are meaningful.
 */
struct qualscorelist {
  int                   scores[kQualScoreBufSize]; /* quality scores */
  int                   num_used;                  /* number of valid entries */
  struct qualscorelist *next;                      /* following node, or NULL */
};

typedef struct qualscorelist  SQualScoreList;
typedef struct qualscorelist *TQualScoreListPtr;
2952
2953
2954 static TQualScoreListPtr s_QualScoreNew ()
2955 {
2956 TQualScoreListPtr s;
2957
2958 s =(TQualScoreListPtr) malloc (sizeof (SQualScoreList));
2959 if (s != NULL) {
2960 s->num_used = 0;
2961 s->next = NULL;
2962 }
2963 return s;
2964 }
2965
2966
2967 static void s_QualScoreFree (TQualScoreListPtr s)
2968 {
2969 TQualScoreListPtr s_next;
2970
2971 while (s != NULL) {
2972 s_next = s->next;
2973 free (s);
2974 s = s_next;
2975 }
2976 }
2977
2978
2979 static TQualScoreListPtr s_AddQualScores (char *line, TQualScoreListPtr scores)
2980 {
2981 TQualScoreListPtr last_score;
2982 char *cp;
2983
2984 if (scores == NULL) {
2985 scores = s_QualScoreNew();
2986 last_score = scores;
2987 } else {
2988 last_score = scores;
2989 while (last_score->next != NULL) {
2990 last_score = last_score->next;
2991 }
2992 }
2993
2994 cp = line;
2995 while (*cp != 0 && *cp != '\r' && *cp != '\n') {
2996 while (isspace (*cp)) {
2997 cp++;
2998 }
2999 if (*cp != 0) {
3000 if (!isdigit (*cp)) {
3001 printf ("Found bad character in quality scores file!\n");
3002 s_QualScoreFree (scores);
3003 scores = NULL;
3004 return scores;
3005 }
3006 if (last_score->num_used == kQualScoreBufSize) {
3007 last_score->next = s_QualScoreNew ();
3008 last_score = last_score->next;
3009 }
3010 last_score->scores[last_score->num_used++] = atoi (cp);
3011 while (isdigit (*cp)) {
3012 cp++;
3013 }
3014 }
3015 }
3016 return scores;
3017 }
3018
3019
3020 static int s_AddScoresToRead (TContigReadPtr read, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3021 {
3022 int score_pos = 0;
3023 int offset = 0;
3024 int skip, score_len;
3025 char *cp;
3026 int *dst;
3027 TFASTASeqBufPtr fasta_seq;
3028
3029 if (read == NULL || scores == NULL) {
3030 return 0;
3031 }
3032
3033 if (fasta == NULL) {
3034 skip = read->read_start - 1;
3035 } else {
3036 fasta_seq = s_GetFastaSeq (fasta, read->read_id);
3037 if (fasta_seq == NULL) {
3038 printf ("Unable to locate fasta for %s\n", read->read_id);
3039 return 0;
3040 }
3041
3042 skip = s_GetSequenceOffset (fasta_seq, read->read_seq, read->is_complement);
3043 if (skip < 0) {
3044 printf ("ACE read did not match FASTA read for %s\n", read->read_id);
3045 return 0;
3046 }
3047 }
3048
3049 /* skip over scores before part used in assembly */
3050 while (scores != NULL && score_pos < skip) {
3051 if (skip - score_pos < scores->num_used) {
3052 offset = skip - score_pos;
3053 score_pos = skip;
3054 } else if (scores->next == NULL) {
3055 printf ("Not enough scores read for %s\n", read->read_id);
3056 return 0;
3057 } else {
3058 score_pos += kQualScoreBufSize;
3059 scores = scores->next;
3060 }
3061 }
3062
3063 score_len = strlen (read->read_seq);
3064 read->qual_scores = malloc (sizeof (int) * score_len);
3065
3066 if (read->is_complement) {
3067 /* need to read scores in reverse direction */
3068 cp = read->read_seq + score_len - 1;
3069 dst = read->qual_scores + score_len - 1;
3070 while (scores != NULL && read->num_qual_scores < score_len) {
3071 if (*cp == '*') {
3072 *dst = 0;
3073 } else {
3074 *dst = scores->scores[offset];
3075 offset++;
3076 }
3077 cp--;
3078 dst--;
3079 read->num_qual_scores++;
3080
3081 if (offset == kQualScoreBufSize) {
3082 scores = scores->next;
3083 offset = 0;
3084 }
3085 }
3086 } else {
3087 cp = read->read_seq;
3088 dst = read->qual_scores;
3089 while (scores != NULL && read->num_qual_scores < score_len) {
3090 if (*cp == '*') {
3091 *dst = 0;
3092 } else {
3093 *dst = scores->scores[offset];
3094 offset++;
3095 }
3096 cp++;
3097 dst++;
3098 read->num_qual_scores++;
3099
3100 if (offset == kQualScoreBufSize) {
3101 scores = scores->next;
3102 offset = 0;
3103 }
3104 }
3105 }
3106 if (read->num_qual_scores == score_len) {
3107 return 1;
3108 } else {
3109 printf ("Not enough qual scores for %s\n", read->read_id);
3110 return 0;
3111 }
3112 }
3113
3114
3115 static int s_AddQualScoresToReadsInAceFile (TACEFilePtr afp, char *id, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3116 {
3117 int i, j, found = 0, match_len, rval = 1;
3118 char *cp;
3119
3120 if (afp == NULL || id == NULL) {
3121 return 0;
3122 }
3123
3124 cp = strchr (id, '.');
3125 if (cp == NULL) {
3126 match_len = strlen (id);
3127 } else {
3128 match_len = cp - id;
3129 }
3130
3131 for (i = 0; i < afp->num_contigs; i++) {
3132 for (j = 0; j < afp->contigs[i]->num_reads; j++) {
3133 if (strncmp (afp->contigs[i]->reads[j]->read_id, id, match_len) == 0) {
3134 found = 1;
3135 rval &= s_AddScoresToRead (afp->contigs[i]->reads[j], scores, fasta);
3136 }
3137 }
3138 }
3139 if (!found) {
3140 printf ("Unable to locate %s in ACE file\n", id);
3141 rval = 0;
3142 }
3143 return rval;
3144 }
3145
3146
3147 extern int
3148 AddReadQualScores
3149 (TACEFilePtr afp,
3150 FReadLineFunction readfunc,
3151 void * userdata,
3152 FReadLineFunction fasta_readfunc,
3153 void * fasta_userdata)
3154 {
3155 char *linestring;
3156 char *score_id = NULL;
3157 TQualScoreListPtr scores = NULL;
3158 TSeqListBufPtr fasta = NULL;
3159 int rval = 1;
3160
3161 if (afp == NULL || readfunc == NULL) {
3162 return 0;
3163 }
3164
3165 if (fasta_readfunc != NULL) {
3166 fasta = s_ReadFastaFile (fasta_readfunc, fasta_userdata);
3167 if (fasta == NULL) {
3168 printf ("Unable to read FASTA file\n");
3169 return 0;
3170 }
3171 }
3172
3173 linestring = readfunc (userdata);
3174 while (!s_IsEOF (linestring)) {
3175 if (linestring[0] == '>') {
3176 if (score_id != NULL && scores != NULL) {
3177 /* add previously read scores to last read */
3178 if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3179 printf ("Failed to add quality scores from %s\n", score_id);
3180 rval = 0;
3181 }
3182 }
3183 s_QualScoreFree (scores);
3184 scores = NULL;
3185 free (score_id);
3186
3187 score_id = s_GetFASTAIdFromString (linestring);
3188 } else if (score_id != NULL) {
3189 scores = s_AddQualScores (linestring, scores);
3190 }
3191 free (linestring);
3192 linestring = readfunc (userdata);
3193 }
3194
3195 /* handle last set of scores read */
3196 if (score_id != NULL && scores != NULL) {
3197 /* add previously read scores to last read */
3198 if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3199 printf ("Failed to add quality scores from %s\n", score_id);
3200 rval = 0;
3201 }
3202 }
3203 s_SeqListBufFree (fasta);
3204 s_QualScoreFree (scores);
3205 scores = NULL;
3206 free (score_id);
3207 return rval;
3208 }
3209
3210
/* Maps a consensus character to its slot in a five-entry count array:
 * 'A' 0, 'T' 1, 'G' 2, 'C' 3, '*' 4.  Returns -1 for any other character.
 */
static int s_LetterPos (char ch) {
  static const char kSlots[] = { 'A', 'T', 'G', 'C', '*' };
  int slot;

  for (slot = 0; slot < (int) sizeof (kSlots); slot++) {
    if (kSlots[slot] == ch) {
      return slot;
    }
  }
  return -1;
}
3233
3234
/* Converts a position in a gapped sequence to the corresponding position
 * in the ungapped sequence: counts the characters other than '*' among the
 * first gapped_pos characters of seq, stopping early at the end of seq.
 */
static int s_GetUngappedPosition (int gapped_pos, char *seq)
{
  int col;
  int bases = 0;

  for (col = 0; seq[col] != 0 && col < gapped_pos; col++) {
    if (seq[col] != '*') {
      bases++;
    }
  }
  return bases;
}
3253
3254
3255 static int s_GetQualScoreForReadPos (TContigReadPtr r, int pos)
3256 {
3257 if (r == NULL || pos < 0) {
3258 return 0;
3259 }
3260
3261 /* note - don't need to get ungapped position because 0s are inserted when qual scores
3262 * are added to the reads.
3263 */
3264 /*pos = s_GetUngappedPosition (pos, r->read_seq); */
3265 if (pos > r->num_qual_scores) {
3266 return 0;
3267 } else {
3268 return r->qual_scores[pos];
3269 }
3270 }
3271
3272
3273 extern int ReplaceConsensusSequenceFromTraces (TContigPtr contig, char only_ns)
3274 {
3275 char * consensus_buf;
3276 int * new_qual_scores = NULL;
3277 int num_qual_scores = 0;
3278 int i, k, best, letter_pos;
3279 int char_counts[5];
3280 char best_ch, ch;
3281 int num_best, sum_best;
3282 int * consensus_qual_ptr = NULL;
3283 int read_offset, len;
3284 int num_change = 0;
3285
3286 if (contig == NULL) {
3287 return 0;
3288 }
3289
3290 consensus_buf = (char *) malloc (sizeof (char) * (contig->consensus_assem_len + 1));
3291 if (contig->reads[0]->num_qual_scores > 0) {
3292 new_qual_scores = (int *) malloc (sizeof (int) * contig->consensus_seq_len);
3293 }
3294
3295 consensus_qual_ptr = contig->qual_scores;
3296
3297 for (i = 0; i < contig->consensus_assem_len; i++) {
3298 if (only_ns && contig->consensus_seq[i] != 'N') {
3299 /* just use existing consensus character */
3300 consensus_buf[i] = contig->consensus_seq[i];
3301 /* add in qual scores */
3302 if (consensus_qual_ptr != NULL && new_qual_scores != NULL && contig->consensus_seq[i] != '*') {
3303 new_qual_scores[num_qual_scores++] = *consensus_qual_ptr;
3304 }
3305 } else {
3306 for (k = 0; k < 5; k++) {
3307 char_counts[k] = 0;
3308 }
3309 best = 0;
3310 best_ch = 'N';
3311 for (k = 0; k < contig->num_reads; k++) {
3312 read_offset = i - contig->reads[k]->cons_start;
3313 len = strlen (contig->reads[k]->read_seq);
3314 if (len > read_offset
3315 && read_offset >= 0) {
3316 ch = toupper (contig->reads[k]->read_seq[read_offset]);
3317 letter_pos = s_LetterPos (ch);
3318 if (letter_pos > -1) {
3319 char_counts[letter_pos]++;
3320 if (char_counts[letter_pos] > best
3321 || (char_counts[letter_pos] == best && best_ch == '*')) {
3322 best_ch = ch;
3323 best = char_counts[letter_pos];
3324 }
3325 }
3326 }
3327 }
3328 if (toupper (consensus_buf[i]) != best_ch) {
3329 num_change++;
3330 consensus_buf[i] = best_ch;
3331 if (best_ch != '*') {
3332 /* calculate quality score */
3333 if (new_qual_scores != NULL) {
3334 sum_best = 0;
3335 num_best = 0;
3336 for (k = 0; k < contig->num_reads; k++) {
3337 if (contig->reads[k]->num_qual_scores > i - contig->reads[k]->cons_start
3338 && best_ch == toupper (contig->reads[k]->read_seq[i - contig->reads[k]->cons_start])) {
3339 num_best ++;
3340 sum_best += s_GetQualScoreForReadPos (contig->reads[k], i - contig->reads[k]->cons_start);
3341 }
3342 }
3343 if (num_best == 0) {
3344 new_qual_scores[num_qual_scores++] = 0;
3345 } else {
3346 new_qual_scores[num_qual_scores++] = sum_best / num_best;
3347 }
3348 }
3349 }
3350 }
3351 }
3352 if (consensus_qual_ptr != NULL && contig->consensus_seq[i] != '*') {
3353 consensus_qual_ptr++;
3354 }
3355 }
3356 consensus_buf[i] = 0;
3357
3358 free (contig->consensus_seq);
3359 contig->consensus_seq = consensus_buf;
3360 if (contig->qual_scores != NULL) {
3361 free (contig->qual_scores);
3362 }
3363 contig->qual_scores = new_qual_scores;
3364 contig->num_qual_scores = num_qual_scores;
3365
3366 return num_change;
3367 }
3368
3369
3370 extern void RecalculateConsensusSequences (TACEFilePtr ace_file, char only_ns)
3371 {
3372 int i;
3373
3374 if (ace_file == NULL) {
3375 return;
3376 }
3377
3378 for (i = 0; i < ace_file->num_contigs; i++) {
3379 ReplaceConsensusSequenceFromTraces(ace_file->contigs[i], only_ns);
3380 }
3381
3382 }
3383
3384
3385 extern void WriteContigQualScores (TContigPtr contig, FILE *out)
3386 {
3387 int i = 0, j;
3388
3389 if (contig == NULL || contig->qual_scores == NULL || contig->num_qual_scores < 1 || out == NULL) {
3390 return;
3391 }
3392 fprintf (out, ">%s\n", contig->consensus_id);
3393
3394 while (i < contig->num_qual_scores) {
3395 for (j = 0; j < 60 && i < contig->num_qual_scores; j++, i++) {
3396 fprintf (out, "%d ", contig->qual_scores[i]);
3397 }
3398 fprintf (out, "\n");
3399 }
3400 fprintf (out, "\n");
3401 }
3402
3403
3404 extern char
3405 ProcessLargeACEFileForContigFastaAndQualScores
3406 (FReadLineFunction readfunc,
3407 void * userdata,
3408 char make_qual_scores,
3409 char * has_errors,
3410 ProcessContigFunc process_func,
3411 void * process_data)
3412 {
3413 char * linestring;
3414 char * cp;
3415 int contig_num = 0, read_num = 0;
3416 int num_reads_expected = 0;
3417 int num_contigs = 0;
3418 TContigPtr contig = NULL;
3419 char rval = 1;
3420
3421 if (readfunc == NULL || process_func == NULL) {
3422 return 0;
3423 }
3424
3425 linestring = readfunc (userdata);
3426
3427 while (linestring != NULL && linestring [0] != EOF) {
3428 if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
3429 if (num_reads_expected > 0) {
3430 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
3431 return 0;
3432 }
3433 /* first line in file, number of contigs */
3434 cp = linestring + 3;
3435 num_contigs = atoi (cp);
3436 while (isdigit (*cp)) {
3437 cp++;
3438 }
3439 num_reads_expected = atoi (cp);
3440 linestring = readfunc (userdata);
3441 } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
3442 if (contig_num >= num_contigs) {
3443 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
3444 return 0;
3445 }
3446
3447 contig = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
3448 if (contig == NULL) {
3449 PrintACEFormatErrorXMLStart (NULL, has_errors);
3450 printf ("Unable to read contig (%d)", contig_num);
3451 PrintACEFormatErrorXMLEnd ();
3452 return 0;
3453 }
3454 read_num += contig->num_reads;
3455 process_func (contig, process_data);
3456 ContigFree (contig);
3457 contig = NULL;
3458 contig_num++;
3459 } else if (s_UnexpectedLineBetweenContigs (linestring)) {
3460 PrintACEFormatErrorXMLStart (NULL, has_errors);
3461 printf ("Unexpected line after contig %d", read_num);
3462 PrintACEFormatErrorXMLEnd ();
3463 return 0;
3464 } else {
3465 linestring = readfunc (userdata);
3466 }
3467 }
3468 if (contig_num < num_contigs) {
3469 PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
3470 rval = 0;
3471 } else if (read_num < num_reads_expected) {
3472 PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
3473 rval = 0;
3474 }
3475
3476 return rval;
3477 }
3478 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |