NCBI C Toolkit Cross Reference

C/demo/copymat.c


  1 static char const rcsid[] = "$Id: copymat.c,v 6.49 2008/11/04 16:44:38 maning Exp $";
  2 
  3 /*
  4 * ===========================================================================
  5 *
  6 *                            PUBLIC DOMAIN NOTICE
  7 *               National Center for Biotechnology Information
  8 *
  9 *  This software/database is a "United States Government Work" under the
 10 *  terms of the United States Copyright Act.  It was written as part of
 11 *  the author's official duties as a United States Government employee and
 12 *  thus cannot be copyrighted.  This software/database is freely available
 13 *  to the public for use. The National Library of Medicine and the U.S.
 14 *  Government have not placed any restriction on its use or reproduction.
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 *  Please cite the author in any work or product based on this material.
 25 *
 26 * ===========================================================================
 27 */
 28 
 29 /*****************************************************************************
 30 
 31 File name: copymat.c
 32 
 33 Authors: Alejandro Schaffer, Sergei Shavirin
 34 
 35 Contents: main routines for copymatrices program to convert
 36 score matrices output by makematrices into a single byte-encoded file.
 37    
 38 $Log: copymat.c,v $
 39 Revision 6.49  2008/11/04 16:44:38  maning
 40 add type cast to fix compilation error
 41 
 42 Revision 6.48  2008/02/01 14:04:25  madden
 43 LookupTableWrapInit prototype change
 44 
 45 Revision 6.47  2006/11/24 19:06:15  kans
 46 added include of blast_filter.h
 47 
 48 Revision 6.46  2006/11/21 17:24:20  papadopo
 49 1. rearrange headers
 50 2. change lookup table type
 51 
 52 Revision 6.45  2006/09/15 15:45:35  madden
 53 Change to LookupTableWrapInit
 54 
 55 Revision 6.44  2005/12/22 14:22:19  papadopo
 56 change signature of BLAST_FillLookupTableOptions
 57 
 58 Revision 6.43  2005/12/20 15:36:39  papadopo
 59 change name of structure field
 60 
 61 Revision 6.42  2005/05/20 18:57:51  camacho
 62 Update to use new signature to BLAST_FillLookupTableOptions
 63 
 64 Revision 6.41  2005/02/14 14:11:55  camacho
 65 Changes to use SBlastScoreMatrix
 66 
 67 Revision 6.40  2005/01/10 13:48:20  madden
 68 Change to BLAST_FillInitialWordOptions prototype
 69 
 70 Revision 6.39  2004/09/15 17:40:21  papadopo
 71 change use of ListNode to use of BlastSeqLoc for lookup table creation
 72 
 73 Revision 6.38  2004/07/12 16:30:44  papadopo
 74 LookupTable->BlastLookupTable
 75 
 76 Revision 6.37  2004/06/22 16:45:56  camacho
 77 Changed the blast_type_* definitions for the EBlastProgramType enumeration from
 78 algo/blast.
 79 
 80 Revision 6.36  2004/04/23 21:11:31  papadopo
 81 force the thick backbone (dumped to the RPS .loo file) to contain the number of cells assumed by RPS blast
 82 
 83 Revision 6.35  2004/04/16 14:48:04  papadopo
 84 remove unneeded argument to FillLookupTableOptions
 85 
 86 Revision 6.34  2004/04/07 21:48:48  camacho
 87 Add missing header file
 88 
 89 Revision 6.33  2004/04/06 12:15:44  camacho
 90 Rename DoubleInt -> SSeqRange
 91 
 92 Revision 6.32  2004/03/10 20:21:27  papadopo
 93 add (unused) RPS blast parameters to FillLookupTableOptions
 94 
 95 Revision 6.31  2004/03/04 21:16:10  papadopo
 96 add (unused) RPS blast parameter to FillLookupTable call
 97 
 98 Revision 6.30  2004/01/30 20:34:45  coulouri
 99 fix minor nit to FileWrite call
100 
101 Revision 6.29  2004/01/26 19:40:48  coulouri
102 * Correct buffer overrun
103 * Use offset rather than pointer in LookupBackboneCell
104 
105 Revision 6.28  2003/11/24 18:18:47  coulouri
106 Correction to previous fix for 64-bit irix
107 
108 Revision 6.27  2003/11/21 18:01:15  ivanov
109 Added extern definition for impalaMakeFileNames()
110 
111 Revision 6.26  2003/11/20 15:44:32  camacho
112 Tom Madden's changes to use lookup table contruction code from algo/blast.
113 
114 Revision 6.25  2003/05/30 17:31:09  coulouri
115 add rcsid
116 
117 Revision 6.24  2003/05/13 16:02:42  coulouri
118 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
119 
120 Revision 6.23  2002/11/06 21:26:47  ucko
121 RPSConcatSequences: provide useful error messages, ignore all trailing space.
122 
123 Revision 6.22  2002/04/08 19:02:31  madden
124 Allow float for threshold
125 
126 Revision 6.21  2001/06/07 16:45:08  shavirin
127 Removed bug related to 64bit address structure on SGI platform.
128 
129 Revision 6.20  2001/04/12 19:50:12  madden
130 Comment out unrescaling of matrix
131 
132 Revision 6.19  2000/11/14 23:17:52  shavirin
133 Removed serious bug under NT platform related to diffence in "w" and "wb"
134 flag when opening file on PC NT computer. Removed unused header files.
135 
136 Revision 6.18  2000/11/13 21:25:22  shavirin
137 Fixed possible bug in the function RPSUpdatePointers (64 bit architecture
138 specific).
139 
140 Revision 6.17  2000/11/08 18:34:19  kans
141 commented out UNIX-specific headers, included by ncbilcl.h for UNIX anyway
142 
143 Revision 6.16  2000/10/20 21:46:37  shavirin
144 Added additional parameters for creating RPS database.
145 
146 Revision 6.15  2000/02/29 16:27:39  shavirin
147 Added protection against matrix with scaleFactor != 1 for RPS Blast
148 
149 Revision 6.14  2000/02/28 21:08:34  shavirin
150 This fixes DEC Alpha problems of RPS Blast.
151 
152 Revision 6.13  2000/02/28 19:06:47  shavirin
153 Added comments for RPS Blast functions.
154 Removed unused code.
155 
156 Revision 6.12  2000/02/22 19:29:06  shavirin
157 Fixed DEC Alpha specific bug in the function RPSCreateLookupFile().
158 
159 Revision 6.11  2000/02/17 19:11:15  shavirin
160 Removed reference to theCacheSize.
161 
162 Revision 6.10  2000/01/13 15:27:10  shavirin
163 Added concatenation of files into single file (for later formatdb).
164 
165 Revision 6.9  2000/01/12 14:39:46  shavirin
166 Added parameter to set cache size in lookup table foe RPS Blast.
167 
168 Revision 6.8  2000/01/07 22:31:47  shavirin
169 Lookup table header now has notice, that this is single table.
170 
171 Revision 6.7  1999/12/30 18:34:20  shavirin
172 Last row in the matrix for every sequence will be gap-row (-INT2_MAX)
173 
174 Revision 6.6  1999/12/29 18:49:29  shavirin
175 Changed a little format of RPS lookup tables file.
176 
177 
178 *****************************************************************************/
179 
180 
181 #include <ncbi.h>
182 #include <sequtil.h>
183 #include <seqport.h>
184 #include <tofasta.h>
185 #include <algo/blast/core/blast_aalookup.h>
186 #include <algo/blast/core/blast_stat.h>
187 #include <algo/blast/core/blast_encoding.h>
188 #include <algo/blast/core/lookup_wrap.h>
189 #include <algo/blast/core/blast_filter.h>
190 
191 #ifndef MAXLINELEN
192 #   define MAXLINELEN 2000
193 #endif
194 #ifndef MAX_NAME_LENGTH
195 #   define MAX_NAME_LENGTH 500
196 #endif
197 #ifndef PRO_ALPHABET_SIZE
198 #   define PRO_ALPHABET_SIZE  26
199 #endif
200 #ifndef SORT_THRESHOLD
201 #   define SORT_THRESHOLD 20
202 #endif
203 #ifndef RPS_MAGIC_NUMBER
204 #   define RPS_MAGIC_NUMBER 7702
205 #endif
206 #ifndef RPS_ARRAY_SIZE
207 #   define RPS_ARRAY_SIZE 32768
208 #endif
209 /*factor used to multiply the gapped K parameter to make it
210   more accurate in most cases*/
211 #ifndef PRO_K_MULTIPLIER
212 #   define PRO_K_MULTIPLIER 1.2
213 #endif
214 #include <algo/blast/core/blast_lookup.h>
215 #include <algo/blast/core/blast_options.h>
216 
217 typedef Int4 ScoreRow[PRO_ALPHABET_SIZE];
218 extern Boolean LIBCALL 
219 IMPALAPrintHelp PROTO((Boolean html, Int4 line_length, Char * programName, 
220                        FILE *outfp));
221 extern void  LIBCALL
222 impalaMakeFileNames PROTO((Char * matrixDbName, Char * auxiliaryFileName,
223                            Char * mmapFileName, Char * seqFileName,
224                            Char *matrixFileName, Char * ckptFileName,
225                            Char *directoryPrefix));
226 
227 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
228 
229 static Args myargs [] = {
230     { "Database for matrix profiles", /* 0 */
231       "stdin", NULL, NULL, FALSE, 'P', ARG_FILE_IN, 0.0, 0, NULL},
232     { "Print help; overrides all other arguments", /* 1 */
233       "F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
234     { "Create RPS mem map file(s)", /* 2 */
235       "T", NULL, NULL, FALSE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
236     { "Threshold for extending hits for RPS database", /* 3 */
237       "11", NULL, NULL, FALSE, 'f', ARG_FLOAT, 0.0, 0, NULL},
238     { "Word size for RPS database", /* 4 */
239       "3", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL},
240 };
241 
242 /*counts the number of items in sequencesFile and matricesFile, assumed to
243   be one per line, and checks that the numbers are equal.
244   returns the number if equal, 0 if unequal, rewinds the file descriptors
245   before returning*/
246 static Int4 countProfiles(FILE *sequencesFile, FILE *matricesFile)
247 {
248     Int4 sequencesCount = 0; /*count for sequencesFile*/
249     Int4 matricesCount = 0; /*count for matricesFile*/
250     Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
251     
252     while (fgets(oneFileName,MAXLINELEN,sequencesFile))
253         sequencesCount++;
254     while (fgets(oneFileName,MAXLINELEN,matricesFile))
255         matricesCount++;
256     rewind(matricesFile);
257     rewind(sequencesFile);
258     if (sequencesCount == matricesCount)
259         return(sequencesCount);
260     else {
261         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Sequences file has %d entries; Matrices file has %d entries; these should be equal\n", sequencesCount,matricesCount);
262         return(0);
263     }
264 }
265 
266 /*free the memory associated with the position-specific score matrices*/
267 static void  freeMatrix(ScoreRow *posMatrix)
268 {
269 
270   MemFree(posMatrix);
271 }
272 
273 /*allocate memory for the position-specific score matrices
274   enough memory is allocated to hold the largest matrix
275   the memory is reused for each different matrix*/
276 static ScoreRow * allocateMatrix(Int4 maxSequenceLength)
277 {
278   ScoreRow *returnMatrix; /*matrix to return*/
279 
280   returnMatrix = (ScoreRow *) MemNew(maxSequenceLength * sizeof(ScoreRow));
281   return(returnMatrix);
282 }
283 
284 /* read in a position-specific score matrix from thisMatrixFile
285    the number of positions is dbSequenceLength
286    kbp keeps the Karlin-ALtschul parameters
287    returnMatrix is the memory address where the matrix is to be stored*/
288 static void readNextMatrix(FILE * thisMatrixFile,
289               Int4 startPos, Int4 *endPos,
290               ScoreRow *bigMatrix)
291 {
292   Int4 i, r; /*row indices for sequence and matrix*/
293   Int4 lengthInFile; /*length of query*/
294   Nlm_FloatHi junkLambda, junkK, junklogK, junkH; /*used to read in useless
295                                                     Karlin blocks*/
296   Char *sequence;  /*sequence to read in*/
297   Char rowOfScores[MAXLINELEN]; /*one row of scores to be read in*/
298 
299   fscanf(thisMatrixFile, "%d", &lengthInFile);
300   sequence = (Char *) MemNew((lengthInFile + 2) * sizeof(Char));
301   fscanf(thisMatrixFile,"%s",sequence);
302   MemFree(sequence);
303   /*read in useless Karlin block*/
304   fscanf(thisMatrixFile,"%le", &junkLambda);
305   fscanf(thisMatrixFile,"%le", &junkK);
306   fscanf(thisMatrixFile,"%le", &junklogK);
307   fscanf(thisMatrixFile,"%le", &junkH);
308   /*read in useless Karlin block*/
309   fscanf(thisMatrixFile,"%le", &junkLambda);
310   fscanf(thisMatrixFile,"%le", &junkK);
311   fscanf(thisMatrixFile,"%le", &junklogK);
312   fscanf(thisMatrixFile,"%le", &junkH);
313   /*read in useless Karlin block*/
314   fscanf(thisMatrixFile,"%le", &junkLambda);
315   fscanf(thisMatrixFile,"%le", &junkK);
316   fscanf(thisMatrixFile,"%le", &junklogK);
317   fscanf(thisMatrixFile,"%le\n", &junkH);
318   for(i = 0, r = startPos; i < lengthInFile; i++, r++) {
319     fgets(rowOfScores, MAXLINELEN, thisMatrixFile);
320     sscanf(rowOfScores, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d", 
321                         &(bigMatrix[r][0]),
322                         &(bigMatrix[r][1]),
323                         &(bigMatrix[r][2]),
324                         &(bigMatrix[r][3]),
325                         &(bigMatrix[r][4]),
326                         &(bigMatrix[r][5]),
327                         &(bigMatrix[r][6]),
328                         &(bigMatrix[r][7]),
329                         &(bigMatrix[r][8]),
330                         &(bigMatrix[r][9]),
331                         &(bigMatrix[r][10]),
332                         &(bigMatrix[r][11]),
333                         &(bigMatrix[r][12]),
334                         &(bigMatrix[r][13]),
335                         &(bigMatrix[r][14]),
336                         &(bigMatrix[r][15]),
337                         &(bigMatrix[r][16]),
338                         &(bigMatrix[r][17]),
339                         &(bigMatrix[r][18]),
340                         &(bigMatrix[r][19]),
341                         &(bigMatrix[r][20]),
342                         &(bigMatrix[r][21]),
343                         &(bigMatrix[r][22]),
344                         &(bigMatrix[r][23]),
345                         &(bigMatrix[r][24]),
346                         &(bigMatrix[r][25]));
347   }
348 
349   if((Boolean) myargs[2].intvalue) {
350       /* Last row in the matrix will be gap-row (-INT2_MAX) */
351       
352       for(i = 0; i < 26; i++) {
353           bigMatrix[r][i] = -INT2_MAX;
354       }
355       r++;
356   }
357 
358   *endPos = r;
359 }
360 
361 /*read each matrix in turn and store its scores in combinedMatrix*/
362 static void readAllMatrices(FILE *matrixnamefp, ScoreRow *combinedMatrix, 
363                             Int4 numProfiles, CharPtr directoryPrefix, 
364                             Int4Ptr seqlens)
365 {
366     Int4 i; /*loop index*/
367     Char oneMatrixFileName[MAXLINELEN]; /*name of matrix file to read*/
368     FILE *thisMatrixFile; /*descriptor for one matrix file*/
369     Int4 startPos; /*starting row in big matrix for next small matrix*/
370     Int4 endPos; /*ending row + 1 in big matrix for this small matrix*/
371     Int4 prefixLength; /*length of directoryPrefix*/
372     Int4 c1,c2; /*loop indices over characters*/
373     Char relativeMatrixFileName[MAXLINELEN];
374     
375     startPos = 0;
376     endPos = 0;
377     if ('\0' != directoryPrefix[0]) {
378         strcpy(oneMatrixFileName, directoryPrefix);
379         prefixLength = strlen(directoryPrefix);
380     }     
381     for (i = 0; i < numProfiles; i++) {
382         if ('\0' == directoryPrefix[0])
383             fscanf(matrixnamefp,"%s", oneMatrixFileName);
384         else {
385             fscanf(matrixnamefp,"%s", relativeMatrixFileName); 
386             for(c1 = prefixLength, c2 = 0; relativeMatrixFileName[c2] != '\0';
387                 c1++, c2++)
388                 oneMatrixFileName[c1] = relativeMatrixFileName[c2];
389             oneMatrixFileName[c1] = '\0';
390         }
391         
392         if ((thisMatrixFile = FileOpen(oneMatrixFileName, "r")) == NULL)  {
393             ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open matrix file %s\n", oneMatrixFileName);
394             return;
395         }
396         readNextMatrix(thisMatrixFile, startPos, &endPos,
397                        combinedMatrix);
398 
399         if(seqlens != NULL) {
400             seqlens[i] = startPos;
401         }
402 
403         startPos = endPos;
404         FileClose(thisMatrixFile);
405     }
406 
407     if(seqlens != NULL) {   /* Last entry - is the end of last sequence */
408         seqlens[i] = startPos;
409     }
410     
411     return;
412 }
413 
414 /*findTotalLength scans matrixAuxiliaryFile to find the
415   total  number of positions among all the position-specific matrices*/
416 static Int4 findTotalLength(FILE *matrixAuxiliaryFile, Int4 numProfiles,
417                             Nlm_FloatHiPtr scalingFactor)
418 {
419     Int4 maxLength; /*maximum length of sequence*/
420     Int4 thisLength; /*length of next sequence*/
421     Int4 totalLength; /*total length to return*/
422     Int4 dbLength; /*length of database*/
423     Int4 i; /*loop index*/
424     Nlm_FloatHi Kungapped, Hungapped; /*two values to read*/
425     Char * underlyingMatrixName; /*name of matrix to read*/
426     Int4 gap_open, gap_extend; /*gap costs to skip over in reading*/
427     
428     underlyingMatrixName = MemNew(MAXLINELEN * sizeof(Char));
429     fscanf(matrixAuxiliaryFile,"%s",underlyingMatrixName);
430     fscanf(matrixAuxiliaryFile,"%d\n", &gap_open);
431     fscanf(matrixAuxiliaryFile,"%d\n", &gap_extend);
432     fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
433     fscanf(matrixAuxiliaryFile, "%le", &Hungapped);
434     fscanf(matrixAuxiliaryFile, "%d", &maxLength);
435     fscanf(matrixAuxiliaryFile, "%d", &dbLength);
436     fscanf(matrixAuxiliaryFile, "%lf", scalingFactor);
437     totalLength = 0;
438     for (i = 0; i < numProfiles; i++) {
439         fscanf(matrixAuxiliaryFile, "%d", &thisLength);
440         fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
441         totalLength += thisLength;
442     }
443     rewind(matrixAuxiliaryFile);
444     MemFree(underlyingMatrixName);
445     return(totalLength);
446 }
447 
448 static Boolean RPSUpdateOffsets(BlastAaLookupTable *lookup)
449 {
450     Uint4 len;
451     Int4 index;
452     Int4 num_used;
453     Int4 offset_diff;
454     AaLookupBackboneCell *bbc;
455     Int4 *ovf;
456 
457     len = lookup->backbone_size;
458     offset_diff = lookup->word_length - 1;
459 
460     // database assumes backbone type of lookup table
461     ASSERT(lookup->bone_type == eBackbone);
462     bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
463     ovf = (Int4 *)(lookup->overflow);
464 
465     /* Walk through table, copying info into mod_lt[] */
466     for(index = 0; index < len; index++) {
467         
468         if((num_used=bbc[index].num_used) <= 3)
469         {
470             while (num_used > 0)
471             {
472                 num_used--;
473                 bbc[index].payload.entries[num_used] += offset_diff;
474             }
475         }
476         else
477         {
478             while (num_used > 0)
479             {
480                  num_used--;
481                  ovf[ bbc[index].payload.overflow_cursor + num_used] += offset_diff;
482             }
483         }
484     }
485     return TRUE;
486 }
487 
488 
489 /* #define RPS_THRESHOLD 11 */
490 /* #define RPS_WORDSIZE  3 */
491 
492 /* -- SSH --
493    Updates absolute pointers of the lookup table to relative pointers -
494    pointers relative to the start of "mod_lookup_table_memory" chunk 
495    RPS Blast will calculate real pointers in run time using these values
496 */
497 Boolean RPSUpdatePointers(BlastAaLookupTable *lookup, Uint4 *new_overflow, Uint4 *new_overflow_size)
498 {
499     Uint4 len;
500     Int4 index;
501     Uint4 *start_address;
502     long mlpp_address;
503     Uint4 *new_overflow_cursor;
504     Int4 *src;
505     Int4 first_hit;
506     AaLookupBackboneCell *bbc;
507     Int4 *ovf;
508 
509     // database assumes backbone type of lookup table
510     ASSERT(lookup->bone_type == eBackbone);
511     bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
512     ovf = (Int4 *)(lookup->overflow);
513 
514     len = lookup->backbone_size;
515 
516     start_address = new_overflow_cursor = new_overflow;
517 
518     /* Walk through table, copying info into mod_lt[] */
519     for(index = 0; index < len; index++) {
520         
521         if(bbc[index].num_used <= 3)
522             continue;
523 
524         src = &(ovf[bbc[index].payload.overflow_cursor]);
525         MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(bbc[index].num_used-1));
526 
527         mlpp_address = (long) new_overflow_cursor;
528 
529         new_overflow_cursor += bbc[index].num_used-1;
530         first_hit = src[0];
531 
532         mlpp_address -= (long) start_address;
533         
534         /* Now this is new relative address - usually small  */
535         bbc[index].payload.entries[1] = (Int4) mlpp_address;
536         bbc[index].payload.entries[0] = first_hit;
537 
538     }
539 
540     *new_overflow_size = new_overflow_cursor - new_overflow;
541 
542     return TRUE;
543 }
544 
545 /* -- SSH --
546    Write lookup table to the disk into file "*.loo", which will be
547    used memory-mapped during RPS Blast search 
548 */
549 Boolean RPSDumpLookupTable(BlastAaLookupTable *lookup, FILE *fd)
550 {
551     Uint4 *new_overflow;
552     Uint4 new_overflow_size;
553     AaLookupBackboneCell empty_cell;
554     Int4 index;
555 
556     RPSUpdateOffsets(lookup);
557 
558     new_overflow = malloc(lookup->overflow_size*sizeof(Uint4)); 
559     RPSUpdatePointers(lookup, new_overflow, &new_overflow_size);
560 
561     FileWrite(lookup->thick_backbone, sizeof(AaLookupBackboneCell), lookup->backbone_size, fd);
562     
563     /* write empty cells out to the thick backbone size that
564        RPS blast expects */
565 
566     memset(&empty_cell, 0, sizeof(empty_cell));
567     for (index = lookup->backbone_size; index < RPS_ARRAY_SIZE + 1; index++)
568         FileWrite(&empty_cell, sizeof(empty_cell), 1, fd);
569 
570     if(new_overflow_size)
571         FileWrite(new_overflow,
572                   sizeof(Uint4),
573                   new_overflow_size,
574                   fd);
575 
576     sfree(new_overflow);
577     
578     return TRUE;
579 }
580 
581 /* Copied verbatim from algo/blast/core/blast_traceback.c */
582 void RPSPsiMatrixAttach(BlastScoreBlk* sbp, Int4** rps_pssm)
583 {
584     ASSERT(sbp);
585 
586     /* Create a dummy PSI-BLAST matrix structure, only to then free it as we'd
587      * like to piggy back on the already created structure to use the gapped
588      * alignment routines */
589     sbp->psi_matrix = (SPsiBlastScoreMatrix*) 
590         calloc(1, sizeof(SPsiBlastScoreMatrix));
591     ASSERT(sbp->psi_matrix);
592 
593     sbp->psi_matrix->pssm = (SBlastScoreMatrix*)
594         calloc(1, sizeof(SBlastScoreMatrix));
595     ASSERT(sbp->psi_matrix->pssm);
596 
597     /* The only data field that RPS-BLAST really needs */
598     sbp->psi_matrix->pssm->data = rps_pssm;
599 }
600 
601 void RPSPsiMatrixDetach(BlastScoreBlk* sbp)
602 {
603     ASSERT(sbp);
604     sbp->psi_matrix->pssm->data = NULL;
605     sfree(sbp->psi_matrix->pssm);
606     sfree(sbp->psi_matrix);
607 }
608 
609 
610 /* -- SSH --
611    Create lookup table for the large sequence, that represented
612    by all collection of PSSM matrixes and dump this table to disk
613    Used by RPS Blast.
614 */
615 Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles,
616                             Int4Ptr seqlens, CharPtr filename, 
617                             Nlm_FloatHi scalingFactor)
618 {
619     BlastScoreBlk *sbp;
620     FILE *fd;
621     Int4  **posMatrix;
622     Int4 start, i, header_size, all_length, magicNumber;
623     Int4Ptr offsets;
624     Int4 num_lookups;
625     BlastSeqLoc *lookup_segment=NULL;
626     BlastAaLookupTable *lookup;
627     LookupTableWrap* lookup_wrap_ptr=NULL;
628     LookupTableOptions* lookup_options;
629    
630 
631     if((fd = FileOpen(filename, "wb")) == NULL)
632         return FALSE;
633     
634     num_lookups = 1; /* Single lookup table for all set */
635 
636     all_length = seqlens[numProfiles] - seqlens[0];
637     
638     posMatrix = MemNew((all_length + 1) * sizeof(Int4 *));
639     for (i = 0; i < all_length; i++) {
640         posMatrix[i] = (Int4 *) &(combinedMatrix[i][0]);
641     }
642     
643     /* Last row is necessary */
644     posMatrix[all_length] = MemNew(sizeof(Int4) * PRO_ALPHABET_SIZE);
645 
646     for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
647         posMatrix[all_length][i] = -INT2_MAX;
648     }
649 
650     sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1);
651     RPSPsiMatrixAttach(sbp, posMatrix);
652     LookupTableOptionsNew(eBlastTypeBlastp, &lookup_options);
653     BLAST_FillLookupTableOptions(lookup_options, eBlastTypePsiBlast, FALSE, 
654         (Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue);
655 
656 
657     BlastSeqLocNew(&lookup_segment, 0, all_length);
658 
659     /* Need query for psi-blast??  where to put the PSSM? */
660     LookupTableWrapInit(NULL, lookup_options, NULL, lookup_segment, sbp, &lookup_wrap_ptr, NULL, NULL);
661    
662     RPSPsiMatrixDetach(sbp);
663     sbp = BlastScoreBlkFree(sbp);
664     lookup_options = LookupTableOptionsFree(lookup_options);
665     lookup_segment = BlastSeqLocFree(lookup_segment);
666 
667     lookup = (BlastAaLookupTable*) lookup_wrap_ptr->lut;
668 
669     /* Only Uint4 maximum length for lookup file allowed in current
670        implementation */
671     header_size = (numProfiles+1)*sizeof(Int4) + 8*sizeof(Int4);
672     
673     /* Beginning of file will be allocated for lookup offsets */
674     fseek(fd, header_size, SEEK_SET);
675     
676     offsets = MemNew(sizeof(Int4) * (num_lookups + 1));
677     
678 
679     offsets[0] = ftell(fd);
680     
681     start = seqlens[0]; /* 0 */
682     
683     RPSDumpLookupTable(lookup, fd);
684     
685     i = 1;
686     
687     offsets[i] = ftell(fd); /* Last offset also recorded */
688     
689     fseek(fd, 0, SEEK_SET);
690     magicNumber = RPS_MAGIC_NUMBER;
691     FileWrite(&magicNumber, sizeof(Int4), 1, fd); /* header[0] */
692     FileWrite(&num_lookups, sizeof(Int4), 1, fd); /* header[1] */
693     FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[2] */
694     FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[3] */
695     FileWrite(&lookup->overflow_size, sizeof(Int4), 1, fd); /* header[4] */
696     
697     /* Now writing recorded offsets in the beginning of the file */
698     
699     fseek(fd, 8*sizeof(Int4), SEEK_SET);
700     FileWrite(offsets, sizeof(Int4), num_lookups + 1, fd);
701     FileClose(fd);
702     
703     /* Final memory cleenup */
704     
705     MemFree(posMatrix[all_length]);
706     MemFree(posMatrix);
707 
708     return TRUE;
709 }
710 
711 /* -- SSH --
712    Create file <database_name> (without extention), which is concatenation
713    of all FASTA files used. Used by RPS Blast.
714 */
715 Boolean RPSConcatSequences(FILE *sfp, CharPtr fastaname)
716 {
717     FILE *fasta_fp, *fd;
718     Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
719     Char buffer[1024];
720     Int4 bytes;
721     CharPtr chptr, last_non_space;
722 
723     if((fasta_fp = FileOpen(fastaname, "w")) == NULL) {
724         ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
725                   "Unable to open target fasta file %s: %s\n",
726                   fastaname, strerror(errno));
727         return FALSE;
728     }
729 
730     rewind(sfp);
731     
732     while (fgets(oneFileName, MAXLINELEN, sfp)) {
733 
734         /* Remove trailing whitespace */
735         last_non_space = NULL;
736         for(chptr = oneFileName; *chptr != NULLB; chptr++) {
737             if (!isspace(*chptr))
738                 last_non_space = chptr;
739         }
740         if (last_non_space != NULL)
741             last_non_space[1] = NULLB;
742         
743         if((fd = FileOpen(oneFileName, "r")) == NULL) {
744             ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
745                       "Unable to open source fasta file %s: %s\n",
746                       oneFileName, strerror(errno));
747             FileClose(fasta_fp);
748             return FALSE;
749         }
750         
751         /* Now concatenating this file into set */
752         while((bytes = FileRead(buffer, 1, 1024, fd)) > 0)
753             FileWrite(buffer, 1, bytes, fasta_fp);
754         FileClose(fd);
755     }
756     
757     FileClose(fasta_fp);
758     
759     return TRUE;
760 }
761 
762 Int2  Main(void)
763 
764 {
765     
766     Char *profilesFileName; /*file name for list of profile file names*/
767     Char sequencesFileName[MAX_NAME_LENGTH]; /*file anme for list of sequence file names*/
768     Char matrixFileName[MAX_NAME_LENGTH]; /*file name for list of matrix file names*/
769     Char auxFileName[MAX_NAME_LENGTH]; /*file name for file containing auxiliary information*/
770     Char bigFileName[MAX_NAME_LENGTH]; /*file name to store byte-encoded coalesced matrix*/
771     Char lookupName[MAX_NAME_LENGTH]; /*file name to store precalculated lookup table */
772     FILE *auxiliaryfp; /*file descriptor for matrix auxiliary file*/
773     FILE *sequencesfp; /*files descriptor for file containing list of sequences*/
774     FILE *matrixnamefp; /*file descriptor for file containing matrix names*/
775     FILE *bigmatrixfile; /*file descriptor for file containing single big matrix*/
776     Int4 numProfiles; /*number of profiles*/
777     Int4 totalProfileLength; /*total length of all profiles*/
778     ScoreRow *combinedMatrix; /*combined matrix for all profiles*/
779     Char *directoryPrefix; /*directory where profile library is kept, used
780                              to reach other directories indirectly*/
781 
782     Int4Ptr seqlens;
783     Nlm_FloatHi scalingFactor; /*matrix scale to skip over in reading*/
784 
785     if (! GetArgs ("copymatrices", NUMARG, myargs)) {
786         return (1);
787     }
788     
789     if ((Boolean) myargs[1].intvalue) {
790         IMPALAPrintHelp(FALSE, 80, "copymat", stdout);
791         return(1);
792     }
793     profilesFileName = myargs[0].strvalue;
794     directoryPrefix = (Char *) MemNew(MAX_NAME_LENGTH *sizeof(char));
795     strcpy(directoryPrefix,profilesFileName);
796     
797     impalaMakeFileNames(profilesFileName, auxFileName, bigFileName,
798                         sequencesFileName, matrixFileName, NULL, 
799                         directoryPrefix);
800     
801     if ((matrixnamefp = FileOpen(matrixFileName, "r")) == NULL) {
802         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with matrix file names %s\n", matrixFileName);
803         return (1);
804     }
805     
806     if ((sequencesfp = FileOpen(sequencesFileName, "r")) == NULL) {
807         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with sequence file names %s\n", sequencesFileName);
808         return (1);
809     }
810     
811     if ((auxiliaryfp = FileOpen(auxFileName, "r")) == NULL) {
812         ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open auxiliary file %s\n", auxFileName);
813         return (1);
814     }
815 
816     /* -- SSH -- Name of matrix file depends on program - RPS or Impala */
817     
818     if((Boolean) myargs[2].intvalue) {
819         sprintf(bigFileName, "%s.rps", profilesFileName);
820     }
821     
822     if ((bigmatrixfile = FileOpen(bigFileName, "wb")) == NULL) {
823         ErrPostEx(SEV_FATAL, 1, 0, "rps-blast: Unable to open big matrix file %s\n", bigFileName);
824         return (1);
825     }
826     
827     numProfiles =  countProfiles(sequencesfp, matrixnamefp);
828     totalProfileLength = findTotalLength(auxiliaryfp, numProfiles, 
829                                          &scalingFactor);
830     
831     /* -- SSH -- Additional line in matrix with -INT2_MAX values */
832     if((Boolean) myargs[2].intvalue) {
833         totalProfileLength += numProfiles;
834     }
835 
836     combinedMatrix = allocateMatrix(totalProfileLength);
837     if (NULL == combinedMatrix) {
838         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to allocate matrix with%d rows\n", totalProfileLength);
839         return (1);
840         
841     }
842     /* -- SSH -- RPS Blast data */
843     if ((Boolean) myargs[2].intvalue) {
844         seqlens = (Int4Ptr) MemNew((numProfiles +1) * sizeof(Int4));
845     } else {
846         seqlens = NULL;
847     }
848     
849     readAllMatrices(matrixnamefp, combinedMatrix, numProfiles,
850                     directoryPrefix, seqlens);
851     
852     /* -- SSH -- For RPS Blast additional info will be added to the file */
853     if ((Boolean) myargs[2].intvalue) {
854         Int4 magicNumber = RPS_MAGIC_NUMBER;
855         FileWrite(&magicNumber, sizeof(Int4), 1, bigmatrixfile);
856         FileWrite(&numProfiles, sizeof(Int4), 1, bigmatrixfile);
857         FileWrite(seqlens, sizeof(Int4), numProfiles + 1, bigmatrixfile);
858         
859         sprintf(lookupName, "%s.loo", profilesFileName);
860         RPSCreateLookupFile(combinedMatrix, numProfiles, seqlens, lookupName,
861                             scalingFactor);
862 
863         if(!RPSConcatSequences(sequencesfp, profilesFileName)) {
864             ErrPostEx(SEV_ERROR, 0,0, "Failure to concatenate sequences");
865             return 1;
866         }
867         
868     }
869     
870     FileWrite((void *) combinedMatrix[0], sizeof(ScoreRow), 
871               (size_t) totalProfileLength, bigmatrixfile);
872     freeMatrix(combinedMatrix); 
873     FileClose(bigmatrixfile);
874     FileClose(matrixnamefp);
875     FileClose(sequencesfp);
876     FileClose(auxiliaryfp);
877     return 0;
878 }
879 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.