|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/demo/copymat.c |
source navigation diff markup identifier search freetext search file search |
1 static char const rcsid[] = "$Id: copymat.c,v 6.49 2008/11/04 16:44:38 maning Exp $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28
29 /*****************************************************************************
30
31 File name: copymat.c
32
33 Authors: Alejandro Schaffer, Sergei Shavirin
34
35 Contents: main routines for copymatrices program to convert
36 score matrices output by makematrices into a single byte-encoded file.
37
38 $Log: copymat.c,v $
39 Revision 6.49 2008/11/04 16:44:38 maning
40 add type cast to fix compilation error
41
42 Revision 6.48 2008/02/01 14:04:25 madden
43 LookupTableWrapInit prototype change
44
45 Revision 6.47 2006/11/24 19:06:15 kans
46 added include of blast_filter.h
47
48 Revision 6.46 2006/11/21 17:24:20 papadopo
49 1. rearrange headers
50 2. change lookup table type
51
52 Revision 6.45 2006/09/15 15:45:35 madden
53 Change to LookupTableWrapInit
54
55 Revision 6.44 2005/12/22 14:22:19 papadopo
56 change signature of BLAST_FillLookupTableOptions
57
58 Revision 6.43 2005/12/20 15:36:39 papadopo
59 change name of structure field
60
61 Revision 6.42 2005/05/20 18:57:51 camacho
62 Update to use new signature to BLAST_FillLookupTableOptions
63
64 Revision 6.41 2005/02/14 14:11:55 camacho
65 Changes to use SBlastScoreMatrix
66
67 Revision 6.40 2005/01/10 13:48:20 madden
68 Change to BLAST_FillInitialWordOptions prototype
69
70 Revision 6.39 2004/09/15 17:40:21 papadopo
71 change use of ListNode to use of BlastSeqLoc for lookup table creation
72
73 Revision 6.38 2004/07/12 16:30:44 papadopo
74 LookupTable->BlastLookupTable
75
76 Revision 6.37 2004/06/22 16:45:56 camacho
77 Changed the blast_type_* definitions for the EBlastProgramType enumeration from
78 algo/blast.
79
80 Revision 6.36 2004/04/23 21:11:31 papadopo
81 force the thick backbone (dumped to the RPS .loo file) to contain the number of cells assumed by RPS blast
82
83 Revision 6.35 2004/04/16 14:48:04 papadopo
84 remove unneeded argument to FillLookupTableOptions
85
86 Revision 6.34 2004/04/07 21:48:48 camacho
87 Add missing header file
88
89 Revision 6.33 2004/04/06 12:15:44 camacho
90 Rename DoubleInt -> SSeqRange
91
92 Revision 6.32 2004/03/10 20:21:27 papadopo
93 add (unused) RPS blast parameters to FillLookupTableOptions
94
95 Revision 6.31 2004/03/04 21:16:10 papadopo
96 add (unused) RPS blast parameter to FillLookupTable call
97
98 Revision 6.30 2004/01/30 20:34:45 coulouri
99 fix minor nit to FileWrite call
100
101 Revision 6.29 2004/01/26 19:40:48 coulouri
102 * Correct buffer overrun
103 * Use offset rather than pointer in LookupBackboneCell
104
105 Revision 6.28 2003/11/24 18:18:47 coulouri
106 Correction to previous fix for 64-bit irix
107
108 Revision 6.27 2003/11/21 18:01:15 ivanov
109 Added extern definition for impalaMakeFileNames()
110
111 Revision 6.26 2003/11/20 15:44:32 camacho
112 Tom Madden's changes to use lookup table contruction code from algo/blast.
113
114 Revision 6.25 2003/05/30 17:31:09 coulouri
115 add rcsid
116
117 Revision 6.24 2003/05/13 16:02:42 coulouri
118 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
119
120 Revision 6.23 2002/11/06 21:26:47 ucko
121 RPSConcatSequences: provide useful error messages, ignore all trailing space.
122
123 Revision 6.22 2002/04/08 19:02:31 madden
124 Allow float for threshold
125
126 Revision 6.21 2001/06/07 16:45:08 shavirin
127 Removed bug related to 64bit address structure on SGI platform.
128
129 Revision 6.20 2001/04/12 19:50:12 madden
130 Comment out unrescaling of matrix
131
132 Revision 6.19 2000/11/14 23:17:52 shavirin
133 Removed serious bug under NT platform related to diffence in "w" and "wb"
134 flag when opening file on PC NT computer. Removed unused header files.
135
136 Revision 6.18 2000/11/13 21:25:22 shavirin
137 Fixed possible bug in the function RPSUpdatePointers (64 bit architecture
138 specific).
139
140 Revision 6.17 2000/11/08 18:34:19 kans
141 commented out UNIX-specific headers, included by ncbilcl.h for UNIX anyway
142
143 Revision 6.16 2000/10/20 21:46:37 shavirin
144 Added additional parameters for creating RPS database.
145
146 Revision 6.15 2000/02/29 16:27:39 shavirin
147 Added protection against matrix with scaleFactor != 1 for RPS Blast
148
149 Revision 6.14 2000/02/28 21:08:34 shavirin
150 This fixes DEC Alpha problems of RPS Blast.
151
152 Revision 6.13 2000/02/28 19:06:47 shavirin
153 Added comments for RPS Blast functions.
154 Removed unused code.
155
156 Revision 6.12 2000/02/22 19:29:06 shavirin
157 Fixed DEC Alpha specific bug in the function RPSCreateLookupFile().
158
159 Revision 6.11 2000/02/17 19:11:15 shavirin
160 Removed reference to theCacheSize.
161
162 Revision 6.10 2000/01/13 15:27:10 shavirin
163 Added concatenation of files into single file (for later formatdb).
164
165 Revision 6.9 2000/01/12 14:39:46 shavirin
166 Added parameter to set cache size in lookup table foe RPS Blast.
167
168 Revision 6.8 2000/01/07 22:31:47 shavirin
169 Lookup table header now has notice, that this is single table.
170
171 Revision 6.7 1999/12/30 18:34:20 shavirin
172 Last row in the matrix for every sequence will be gap-row (-INT2_MAX)
173
174 Revision 6.6 1999/12/29 18:49:29 shavirin
175 Changed a little format of RPS lookup tables file.
176
177
178 *****************************************************************************/
179
180
181 #include <ncbi.h>
182 #include <sequtil.h>
183 #include <seqport.h>
184 #include <tofasta.h>
185 #include <algo/blast/core/blast_aalookup.h>
186 #include <algo/blast/core/blast_stat.h>
187 #include <algo/blast/core/blast_encoding.h>
188 #include <algo/blast/core/lookup_wrap.h>
189 #include <algo/blast/core/blast_filter.h>
190
/* Compile-time constants.  Each one keeps its default only when the build
   has not already supplied a definition. */

/* Capacity of the buffers that hold one input line or one file name read
   from a list file. */
#if !defined(MAXLINELEN)
#define MAXLINELEN 2000
#endif

/* Capacity of the file-name buffers declared in Main. */
#if !defined(MAX_NAME_LENGTH)
#define MAX_NAME_LENGTH 500
#endif

/* Number of Int4 scores stored in one ScoreRow. */
#if !defined(PRO_ALPHABET_SIZE)
#define PRO_ALPHABET_SIZE 26
#endif

/* Not referenced by the code visible in this file. */
#if !defined(SORT_THRESHOLD)
#define SORT_THRESHOLD 20
#endif

/* First 4-byte word written to both the .rps and the .loo output files. */
#if !defined(RPS_MAGIC_NUMBER)
#define RPS_MAGIC_NUMBER 7702
#endif

/* RPSDumpLookupTable pads the dumped backbone with empty cells until it
   holds RPS_ARRAY_SIZE + 1 cells. */
#if !defined(RPS_ARRAY_SIZE)
#define RPS_ARRAY_SIZE 32768
#endif

/* Factor used to multiply the gapped K parameter to make it more accurate
   in most cases. */
#if !defined(PRO_K_MULTIPLIER)
#define PRO_K_MULTIPLIER 1.2
#endif
214 #include <algo/blast/core/blast_lookup.h>
215 #include <algo/blast/core/blast_options.h>
216
/* One row of a position-specific score matrix: PRO_ALPHABET_SIZE Int4
   scores.  The combined matrix handled by this file is an array of these
   rows. */
typedef Int4 ScoreRow[PRO_ALPHABET_SIZE];

/* Defined outside this file.  Main calls it when the 'H' (print help)
   argument is set. */
extern Boolean LIBCALL
IMPALAPrintHelp PROTO((Boolean html, Int4 line_length, Char * programName,
                       FILE *outfp));

/* Defined outside this file.  Main passes the profile database name plus
   output buffers for the auxiliary, big-matrix, sequence-list and
   matrix-list file names (and NULL for ckptFileName).
   NOTE(review): presumably fills those buffers and directoryPrefix from
   matrixDbName -- confirm against the implementation. */
extern void LIBCALL
impalaMakeFileNames PROTO((Char * matrixDbName, Char * auxiliaryFileName,
                           Char * mmapFileName, Char * seqFileName,
                           Char *matrixFileName, Char * ckptFileName,
                           Char *directoryPrefix));
226
/* Number of entries in myargs; Main passes it to GetArgs. */
#define NUMARG (sizeof(myargs)/sizeof(myargs[0]))

/* Command-line argument table handed to GetArgs by Main.  The index comment
   on each entry is the subscript used elsewhere in this file: the parsed
   values are read as myargs[0].strvalue, myargs[1].intvalue,
   myargs[2].intvalue, myargs[3].floatvalue and myargs[4].intvalue. */
static Args myargs [] = {
    { "Database for matrix profiles", /* 0 */
      "stdin", NULL, NULL, FALSE, 'P', ARG_FILE_IN, 0.0, 0, NULL},
    { "Print help; overrides all other arguments", /* 1 */
      "F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
    { "Create RPS mem map file(s)", /* 2 */
      "T", NULL, NULL, FALSE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
    { "Threshold for extending hits for RPS database", /* 3 */
      "11", NULL, NULL, FALSE, 'f', ARG_FLOAT, 0.0, 0, NULL},
    { "Word size for RPS database", /* 4 */
      "3", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL},
};
241
242 /*counts the number of items in sequencesFile and matricesFile, assumed to
243 be one per line, and checks that the numbers are equal.
244 returns the number if equal, 0 if unequal, rewinds the file descriptors
245 before returning*/
246 static Int4 countProfiles(FILE *sequencesFile, FILE *matricesFile)
247 {
248 Int4 sequencesCount = 0; /*count for sequencesFile*/
249 Int4 matricesCount = 0; /*count for matricesFile*/
250 Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
251
252 while (fgets(oneFileName,MAXLINELEN,sequencesFile))
253 sequencesCount++;
254 while (fgets(oneFileName,MAXLINELEN,matricesFile))
255 matricesCount++;
256 rewind(matricesFile);
257 rewind(sequencesFile);
258 if (sequencesCount == matricesCount)
259 return(sequencesCount);
260 else {
261 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Sequences file has %d entries; Matrices file has %d entries; these should be equal\n", sequencesCount,matricesCount);
262 return(0);
263 }
264 }
265
266 /*free the memory associated with the position-specific score matrices*/
267 static void freeMatrix(ScoreRow *posMatrix)
268 {
269
270 MemFree(posMatrix);
271 }
272
273 /*allocate memory for the position-specific score matrices
274 enough memory is allocated to hold the largest matrix
275 the memory is reused for each different matrix*/
276 static ScoreRow * allocateMatrix(Int4 maxSequenceLength)
277 {
278 ScoreRow *returnMatrix; /*matrix to return*/
279
280 returnMatrix = (ScoreRow *) MemNew(maxSequenceLength * sizeof(ScoreRow));
281 return(returnMatrix);
282 }
283
284 /* read in a position-specific score matrix from thisMatrixFile
285 the number of positions is dbSequenceLength
286 kbp keeps the Karlin-ALtschul parameters
287 returnMatrix is the memory address where the matrix is to be stored*/
288 static void readNextMatrix(FILE * thisMatrixFile,
289 Int4 startPos, Int4 *endPos,
290 ScoreRow *bigMatrix)
291 {
292 Int4 i, r; /*row indices for sequence and matrix*/
293 Int4 lengthInFile; /*length of query*/
294 Nlm_FloatHi junkLambda, junkK, junklogK, junkH; /*used to read in useless
295 Karlin blocks*/
296 Char *sequence; /*sequence to read in*/
297 Char rowOfScores[MAXLINELEN]; /*one row of scores to be read in*/
298
299 fscanf(thisMatrixFile, "%d", &lengthInFile);
300 sequence = (Char *) MemNew((lengthInFile + 2) * sizeof(Char));
301 fscanf(thisMatrixFile,"%s",sequence);
302 MemFree(sequence);
303 /*read in useless Karlin block*/
304 fscanf(thisMatrixFile,"%le", &junkLambda);
305 fscanf(thisMatrixFile,"%le", &junkK);
306 fscanf(thisMatrixFile,"%le", &junklogK);
307 fscanf(thisMatrixFile,"%le", &junkH);
308 /*read in useless Karlin block*/
309 fscanf(thisMatrixFile,"%le", &junkLambda);
310 fscanf(thisMatrixFile,"%le", &junkK);
311 fscanf(thisMatrixFile,"%le", &junklogK);
312 fscanf(thisMatrixFile,"%le", &junkH);
313 /*read in useless Karlin block*/
314 fscanf(thisMatrixFile,"%le", &junkLambda);
315 fscanf(thisMatrixFile,"%le", &junkK);
316 fscanf(thisMatrixFile,"%le", &junklogK);
317 fscanf(thisMatrixFile,"%le\n", &junkH);
318 for(i = 0, r = startPos; i < lengthInFile; i++, r++) {
319 fgets(rowOfScores, MAXLINELEN, thisMatrixFile);
320 sscanf(rowOfScores, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
321 &(bigMatrix[r][0]),
322 &(bigMatrix[r][1]),
323 &(bigMatrix[r][2]),
324 &(bigMatrix[r][3]),
325 &(bigMatrix[r][4]),
326 &(bigMatrix[r][5]),
327 &(bigMatrix[r][6]),
328 &(bigMatrix[r][7]),
329 &(bigMatrix[r][8]),
330 &(bigMatrix[r][9]),
331 &(bigMatrix[r][10]),
332 &(bigMatrix[r][11]),
333 &(bigMatrix[r][12]),
334 &(bigMatrix[r][13]),
335 &(bigMatrix[r][14]),
336 &(bigMatrix[r][15]),
337 &(bigMatrix[r][16]),
338 &(bigMatrix[r][17]),
339 &(bigMatrix[r][18]),
340 &(bigMatrix[r][19]),
341 &(bigMatrix[r][20]),
342 &(bigMatrix[r][21]),
343 &(bigMatrix[r][22]),
344 &(bigMatrix[r][23]),
345 &(bigMatrix[r][24]),
346 &(bigMatrix[r][25]));
347 }
348
349 if((Boolean) myargs[2].intvalue) {
350 /* Last row in the matrix will be gap-row (-INT2_MAX) */
351
352 for(i = 0; i < 26; i++) {
353 bigMatrix[r][i] = -INT2_MAX;
354 }
355 r++;
356 }
357
358 *endPos = r;
359 }
360
361 /*read each matrix in turn and store its scores in combinedMatrix*/
362 static void readAllMatrices(FILE *matrixnamefp, ScoreRow *combinedMatrix,
363 Int4 numProfiles, CharPtr directoryPrefix,
364 Int4Ptr seqlens)
365 {
366 Int4 i; /*loop index*/
367 Char oneMatrixFileName[MAXLINELEN]; /*name of matrix file to read*/
368 FILE *thisMatrixFile; /*descriptor for one matrix file*/
369 Int4 startPos; /*starting row in big matrix for next small matrix*/
370 Int4 endPos; /*ending row + 1 in big matrix for this small matrix*/
371 Int4 prefixLength; /*length of directoryPrefix*/
372 Int4 c1,c2; /*loop indices over characters*/
373 Char relativeMatrixFileName[MAXLINELEN];
374
375 startPos = 0;
376 endPos = 0;
377 if ('\0' != directoryPrefix[0]) {
378 strcpy(oneMatrixFileName, directoryPrefix);
379 prefixLength = strlen(directoryPrefix);
380 }
381 for (i = 0; i < numProfiles; i++) {
382 if ('\0' == directoryPrefix[0])
383 fscanf(matrixnamefp,"%s", oneMatrixFileName);
384 else {
385 fscanf(matrixnamefp,"%s", relativeMatrixFileName);
386 for(c1 = prefixLength, c2 = 0; relativeMatrixFileName[c2] != '\0';
387 c1++, c2++)
388 oneMatrixFileName[c1] = relativeMatrixFileName[c2];
389 oneMatrixFileName[c1] = '\0';
390 }
391
392 if ((thisMatrixFile = FileOpen(oneMatrixFileName, "r")) == NULL) {
393 ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open matrix file %s\n", oneMatrixFileName);
394 return;
395 }
396 readNextMatrix(thisMatrixFile, startPos, &endPos,
397 combinedMatrix);
398
399 if(seqlens != NULL) {
400 seqlens[i] = startPos;
401 }
402
403 startPos = endPos;
404 FileClose(thisMatrixFile);
405 }
406
407 if(seqlens != NULL) { /* Last entry - is the end of last sequence */
408 seqlens[i] = startPos;
409 }
410
411 return;
412 }
413
414 /*findTotalLength scans matrixAuxiliaryFile to find the
415 total number of positions among all the position-specific matrices*/
416 static Int4 findTotalLength(FILE *matrixAuxiliaryFile, Int4 numProfiles,
417 Nlm_FloatHiPtr scalingFactor)
418 {
419 Int4 maxLength; /*maximum length of sequence*/
420 Int4 thisLength; /*length of next sequence*/
421 Int4 totalLength; /*total length to return*/
422 Int4 dbLength; /*length of database*/
423 Int4 i; /*loop index*/
424 Nlm_FloatHi Kungapped, Hungapped; /*two values to read*/
425 Char * underlyingMatrixName; /*name of matrix to read*/
426 Int4 gap_open, gap_extend; /*gap costs to skip over in reading*/
427
428 underlyingMatrixName = MemNew(MAXLINELEN * sizeof(Char));
429 fscanf(matrixAuxiliaryFile,"%s",underlyingMatrixName);
430 fscanf(matrixAuxiliaryFile,"%d\n", &gap_open);
431 fscanf(matrixAuxiliaryFile,"%d\n", &gap_extend);
432 fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
433 fscanf(matrixAuxiliaryFile, "%le", &Hungapped);
434 fscanf(matrixAuxiliaryFile, "%d", &maxLength);
435 fscanf(matrixAuxiliaryFile, "%d", &dbLength);
436 fscanf(matrixAuxiliaryFile, "%lf", scalingFactor);
437 totalLength = 0;
438 for (i = 0; i < numProfiles; i++) {
439 fscanf(matrixAuxiliaryFile, "%d", &thisLength);
440 fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
441 totalLength += thisLength;
442 }
443 rewind(matrixAuxiliaryFile);
444 MemFree(underlyingMatrixName);
445 return(totalLength);
446 }
447
448 static Boolean RPSUpdateOffsets(BlastAaLookupTable *lookup)
449 {
450 Uint4 len;
451 Int4 index;
452 Int4 num_used;
453 Int4 offset_diff;
454 AaLookupBackboneCell *bbc;
455 Int4 *ovf;
456
457 len = lookup->backbone_size;
458 offset_diff = lookup->word_length - 1;
459
460 // database assumes backbone type of lookup table
461 ASSERT(lookup->bone_type == eBackbone);
462 bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
463 ovf = (Int4 *)(lookup->overflow);
464
465 /* Walk through table, copying info into mod_lt[] */
466 for(index = 0; index < len; index++) {
467
468 if((num_used=bbc[index].num_used) <= 3)
469 {
470 while (num_used > 0)
471 {
472 num_used--;
473 bbc[index].payload.entries[num_used] += offset_diff;
474 }
475 }
476 else
477 {
478 while (num_used > 0)
479 {
480 num_used--;
481 ovf[ bbc[index].payload.overflow_cursor + num_used] += offset_diff;
482 }
483 }
484 }
485 return TRUE;
486 }
487
488
489 /* #define RPS_THRESHOLD 11 */
490 /* #define RPS_WORDSIZE 3 */
491
492 /* -- SSH --
493 Updates absolute pointers of the lookup table to relative pointers -
494 pointers relative to the start of "mod_lookup_table_memory" chunk
495 RPS Blast will calculate real pointers in run time using these values
496 */
497 Boolean RPSUpdatePointers(BlastAaLookupTable *lookup, Uint4 *new_overflow, Uint4 *new_overflow_size)
498 {
499 Uint4 len;
500 Int4 index;
501 Uint4 *start_address;
502 long mlpp_address;
503 Uint4 *new_overflow_cursor;
504 Int4 *src;
505 Int4 first_hit;
506 AaLookupBackboneCell *bbc;
507 Int4 *ovf;
508
509 // database assumes backbone type of lookup table
510 ASSERT(lookup->bone_type == eBackbone);
511 bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
512 ovf = (Int4 *)(lookup->overflow);
513
514 len = lookup->backbone_size;
515
516 start_address = new_overflow_cursor = new_overflow;
517
518 /* Walk through table, copying info into mod_lt[] */
519 for(index = 0; index < len; index++) {
520
521 if(bbc[index].num_used <= 3)
522 continue;
523
524 src = &(ovf[bbc[index].payload.overflow_cursor]);
525 MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(bbc[index].num_used-1));
526
527 mlpp_address = (long) new_overflow_cursor;
528
529 new_overflow_cursor += bbc[index].num_used-1;
530 first_hit = src[0];
531
532 mlpp_address -= (long) start_address;
533
534 /* Now this is new relative address - usually small */
535 bbc[index].payload.entries[1] = (Int4) mlpp_address;
536 bbc[index].payload.entries[0] = first_hit;
537
538 }
539
540 *new_overflow_size = new_overflow_cursor - new_overflow;
541
542 return TRUE;
543 }
544
545 /* -- SSH --
546 Write lookup table to the disk into file "*.loo", which will be
547 used memory-mapped during RPS Blast search
548 */
549 Boolean RPSDumpLookupTable(BlastAaLookupTable *lookup, FILE *fd)
550 {
551 Uint4 *new_overflow;
552 Uint4 new_overflow_size;
553 AaLookupBackboneCell empty_cell;
554 Int4 index;
555
556 RPSUpdateOffsets(lookup);
557
558 new_overflow = malloc(lookup->overflow_size*sizeof(Uint4));
559 RPSUpdatePointers(lookup, new_overflow, &new_overflow_size);
560
561 FileWrite(lookup->thick_backbone, sizeof(AaLookupBackboneCell), lookup->backbone_size, fd);
562
563 /* write empty cells out to the thick backbone size that
564 RPS blast expects */
565
566 memset(&empty_cell, 0, sizeof(empty_cell));
567 for (index = lookup->backbone_size; index < RPS_ARRAY_SIZE + 1; index++)
568 FileWrite(&empty_cell, sizeof(empty_cell), 1, fd);
569
570 if(new_overflow_size)
571 FileWrite(new_overflow,
572 sizeof(Uint4),
573 new_overflow_size,
574 fd);
575
576 sfree(new_overflow);
577
578 return TRUE;
579 }
580
581 /* Copied verbatim from algo/blast/core/blast_traceback.c */
582 void RPSPsiMatrixAttach(BlastScoreBlk* sbp, Int4** rps_pssm)
583 {
584 ASSERT(sbp);
585
586 /* Create a dummy PSI-BLAST matrix structure, only to then free it as we'd
587 * like to piggy back on the already created structure to use the gapped
588 * alignment routines */
589 sbp->psi_matrix = (SPsiBlastScoreMatrix*)
590 calloc(1, sizeof(SPsiBlastScoreMatrix));
591 ASSERT(sbp->psi_matrix);
592
593 sbp->psi_matrix->pssm = (SBlastScoreMatrix*)
594 calloc(1, sizeof(SBlastScoreMatrix));
595 ASSERT(sbp->psi_matrix->pssm);
596
597 /* The only data field that RPS-BLAST really needs */
598 sbp->psi_matrix->pssm->data = rps_pssm;
599 }
600
601 void RPSPsiMatrixDetach(BlastScoreBlk* sbp)
602 {
603 ASSERT(sbp);
604 sbp->psi_matrix->pssm->data = NULL;
605 sfree(sbp->psi_matrix->pssm);
606 sfree(sbp->psi_matrix);
607 }
608
609
610 /* -- SSH --
611 Create lookup table for the large sequence, that represented
612 by all collection of PSSM matrixes and dump this table to disk
613 Used by RPS Blast.
614 */
615 Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles,
616 Int4Ptr seqlens, CharPtr filename,
617 Nlm_FloatHi scalingFactor)
618 {
619 BlastScoreBlk *sbp;
620 FILE *fd;
621 Int4 **posMatrix;
622 Int4 start, i, header_size, all_length, magicNumber;
623 Int4Ptr offsets;
624 Int4 num_lookups;
625 BlastSeqLoc *lookup_segment=NULL;
626 BlastAaLookupTable *lookup;
627 LookupTableWrap* lookup_wrap_ptr=NULL;
628 LookupTableOptions* lookup_options;
629
630
631 if((fd = FileOpen(filename, "wb")) == NULL)
632 return FALSE;
633
634 num_lookups = 1; /* Single lookup table for all set */
635
636 all_length = seqlens[numProfiles] - seqlens[0];
637
638 posMatrix = MemNew((all_length + 1) * sizeof(Int4 *));
639 for (i = 0; i < all_length; i++) {
640 posMatrix[i] = (Int4 *) &(combinedMatrix[i][0]);
641 }
642
643 /* Last row is necessary */
644 posMatrix[all_length] = MemNew(sizeof(Int4) * PRO_ALPHABET_SIZE);
645
646 for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
647 posMatrix[all_length][i] = -INT2_MAX;
648 }
649
650 sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1);
651 RPSPsiMatrixAttach(sbp, posMatrix);
652 LookupTableOptionsNew(eBlastTypeBlastp, &lookup_options);
653 BLAST_FillLookupTableOptions(lookup_options, eBlastTypePsiBlast, FALSE,
654 (Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue);
655
656
657 BlastSeqLocNew(&lookup_segment, 0, all_length);
658
659 /* Need query for psi-blast?? where to put the PSSM? */
660 LookupTableWrapInit(NULL, lookup_options, NULL, lookup_segment, sbp, &lookup_wrap_ptr, NULL, NULL);
661
662 RPSPsiMatrixDetach(sbp);
663 sbp = BlastScoreBlkFree(sbp);
664 lookup_options = LookupTableOptionsFree(lookup_options);
665 lookup_segment = BlastSeqLocFree(lookup_segment);
666
667 lookup = (BlastAaLookupTable*) lookup_wrap_ptr->lut;
668
669 /* Only Uint4 maximum length for lookup file allowed in current
670 implementation */
671 header_size = (numProfiles+1)*sizeof(Int4) + 8*sizeof(Int4);
672
673 /* Beginning of file will be allocated for lookup offsets */
674 fseek(fd, header_size, SEEK_SET);
675
676 offsets = MemNew(sizeof(Int4) * (num_lookups + 1));
677
678
679 offsets[0] = ftell(fd);
680
681 start = seqlens[0]; /* 0 */
682
683 RPSDumpLookupTable(lookup, fd);
684
685 i = 1;
686
687 offsets[i] = ftell(fd); /* Last offset also recorded */
688
689 fseek(fd, 0, SEEK_SET);
690 magicNumber = RPS_MAGIC_NUMBER;
691 FileWrite(&magicNumber, sizeof(Int4), 1, fd); /* header[0] */
692 FileWrite(&num_lookups, sizeof(Int4), 1, fd); /* header[1] */
693 FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[2] */
694 FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[3] */
695 FileWrite(&lookup->overflow_size, sizeof(Int4), 1, fd); /* header[4] */
696
697 /* Now writing recorded offsets in the beginning of the file */
698
699 fseek(fd, 8*sizeof(Int4), SEEK_SET);
700 FileWrite(offsets, sizeof(Int4), num_lookups + 1, fd);
701 FileClose(fd);
702
703 /* Final memory cleenup */
704
705 MemFree(posMatrix[all_length]);
706 MemFree(posMatrix);
707
708 return TRUE;
709 }
710
711 /* -- SSH --
712 Create file <database_name> (without extention), which is concatenation
713 of all FASTA files used. Used by RPS Blast.
714 */
715 Boolean RPSConcatSequences(FILE *sfp, CharPtr fastaname)
716 {
717 FILE *fasta_fp, *fd;
718 Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
719 Char buffer[1024];
720 Int4 bytes;
721 CharPtr chptr, last_non_space;
722
723 if((fasta_fp = FileOpen(fastaname, "w")) == NULL) {
724 ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
725 "Unable to open target fasta file %s: %s\n",
726 fastaname, strerror(errno));
727 return FALSE;
728 }
729
730 rewind(sfp);
731
732 while (fgets(oneFileName, MAXLINELEN, sfp)) {
733
734 /* Remove trailing whitespace */
735 last_non_space = NULL;
736 for(chptr = oneFileName; *chptr != NULLB; chptr++) {
737 if (!isspace(*chptr))
738 last_non_space = chptr;
739 }
740 if (last_non_space != NULL)
741 last_non_space[1] = NULLB;
742
743 if((fd = FileOpen(oneFileName, "r")) == NULL) {
744 ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
745 "Unable to open source fasta file %s: %s\n",
746 oneFileName, strerror(errno));
747 FileClose(fasta_fp);
748 return FALSE;
749 }
750
751 /* Now concatenating this file into set */
752 while((bytes = FileRead(buffer, 1, 1024, fd)) > 0)
753 FileWrite(buffer, 1, bytes, fasta_fp);
754 FileClose(fd);
755 }
756
757 FileClose(fasta_fp);
758
759 return TRUE;
760 }
761
762 Int2 Main(void)
763
764 {
765
766 Char *profilesFileName; /*file name for list of profile file names*/
767 Char sequencesFileName[MAX_NAME_LENGTH]; /*file anme for list of sequence file names*/
768 Char matrixFileName[MAX_NAME_LENGTH]; /*file name for list of matrix file names*/
769 Char auxFileName[MAX_NAME_LENGTH]; /*file name for file containing auxiliary information*/
770 Char bigFileName[MAX_NAME_LENGTH]; /*file name to store byte-encoded coalesced matrix*/
771 Char lookupName[MAX_NAME_LENGTH]; /*file name to store precalculated lookup table */
772 FILE *auxiliaryfp; /*file descriptor for matrix auxiliary file*/
773 FILE *sequencesfp; /*files descriptor for file containing list of sequences*/
774 FILE *matrixnamefp; /*file descriptor for file containing matrix names*/
775 FILE *bigmatrixfile; /*file descriptor for file containing single big matrix*/
776 Int4 numProfiles; /*number of profiles*/
777 Int4 totalProfileLength; /*total length of all profiles*/
778 ScoreRow *combinedMatrix; /*combined matrix for all profiles*/
779 Char *directoryPrefix; /*directory where profile library is kept, used
780 to reach other directories indirectly*/
781
782 Int4Ptr seqlens;
783 Nlm_FloatHi scalingFactor; /*matrix scale to skip over in reading*/
784
785 if (! GetArgs ("copymatrices", NUMARG, myargs)) {
786 return (1);
787 }
788
789 if ((Boolean) myargs[1].intvalue) {
790 IMPALAPrintHelp(FALSE, 80, "copymat", stdout);
791 return(1);
792 }
793 profilesFileName = myargs[0].strvalue;
794 directoryPrefix = (Char *) MemNew(MAX_NAME_LENGTH *sizeof(char));
795 strcpy(directoryPrefix,profilesFileName);
796
797 impalaMakeFileNames(profilesFileName, auxFileName, bigFileName,
798 sequencesFileName, matrixFileName, NULL,
799 directoryPrefix);
800
801 if ((matrixnamefp = FileOpen(matrixFileName, "r")) == NULL) {
802 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with matrix file names %s\n", matrixFileName);
803 return (1);
804 }
805
806 if ((sequencesfp = FileOpen(sequencesFileName, "r")) == NULL) {
807 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with sequence file names %s\n", sequencesFileName);
808 return (1);
809 }
810
811 if ((auxiliaryfp = FileOpen(auxFileName, "r")) == NULL) {
812 ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open auxiliary file %s\n", auxFileName);
813 return (1);
814 }
815
816 /* -- SSH -- Name of matrix file depends on program - RPS or Impala */
817
818 if((Boolean) myargs[2].intvalue) {
819 sprintf(bigFileName, "%s.rps", profilesFileName);
820 }
821
822 if ((bigmatrixfile = FileOpen(bigFileName, "wb")) == NULL) {
823 ErrPostEx(SEV_FATAL, 1, 0, "rps-blast: Unable to open big matrix file %s\n", bigFileName);
824 return (1);
825 }
826
827 numProfiles = countProfiles(sequencesfp, matrixnamefp);
828 totalProfileLength = findTotalLength(auxiliaryfp, numProfiles,
829 &scalingFactor);
830
831 /* -- SSH -- Additional line in matrix with -INT2_MAX values */
832 if((Boolean) myargs[2].intvalue) {
833 totalProfileLength += numProfiles;
834 }
835
836 combinedMatrix = allocateMatrix(totalProfileLength);
837 if (NULL == combinedMatrix) {
838 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to allocate matrix with%d rows\n", totalProfileLength);
839 return (1);
840
841 }
842 /* -- SSH -- RPS Blast data */
843 if ((Boolean) myargs[2].intvalue) {
844 seqlens = (Int4Ptr) MemNew((numProfiles +1) * sizeof(Int4));
845 } else {
846 seqlens = NULL;
847 }
848
849 readAllMatrices(matrixnamefp, combinedMatrix, numProfiles,
850 directoryPrefix, seqlens);
851
852 /* -- SSH -- For RPS Blast additional info will be added to the file */
853 if ((Boolean) myargs[2].intvalue) {
854 Int4 magicNumber = RPS_MAGIC_NUMBER;
855 FileWrite(&magicNumber, sizeof(Int4), 1, bigmatrixfile);
856 FileWrite(&numProfiles, sizeof(Int4), 1, bigmatrixfile);
857 FileWrite(seqlens, sizeof(Int4), numProfiles + 1, bigmatrixfile);
858
859 sprintf(lookupName, "%s.loo", profilesFileName);
860 RPSCreateLookupFile(combinedMatrix, numProfiles, seqlens, lookupName,
861 scalingFactor);
862
863 if(!RPSConcatSequences(sequencesfp, profilesFileName)) {
864 ErrPostEx(SEV_ERROR, 0,0, "Failure to concatenate sequences");
865 return 1;
866 }
867
868 }
869
870 FileWrite((void *) combinedMatrix[0], sizeof(ScoreRow),
871 (size_t) totalProfileLength, bigmatrixfile);
872 freeMatrix(combinedMatrix);
873 FileClose(bigmatrixfile);
874 FileClose(matrixnamefp);
875 FileClose(sequencesfp);
876 FileClose(auxiliaryfp);
877 return 0;
878 }
879 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |