|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/asn/scoremat.asn |
source navigation diff markup identifier search freetext search file search |
1 --$Id: scoremat.asn,v 1.12 2008/04/15 15:55:45 kazimird Exp $
2 -- ===========================================================================
3 --
4 -- PUBLIC DOMAIN NOTICE
5 -- National Center for Biotechnology Information
6 --
7 -- This software/database is a "United States Government Work" under the
8 -- terms of the United States Copyright Act. It was written as part of
9 -- the author's official duties as a United States Government employee and
10 -- thus cannot be copyrighted. This software/database is freely available
11 -- to the public for use. The National Library of Medicine and the U.S.
12 -- Government have not placed any restriction on its use or reproduction.
13 --
14 -- Although all reasonable efforts have been taken to ensure the accuracy
15 -- and reliability of the software and data, the NLM and the U.S.
16 -- Government do not and cannot warrant the performance or results that
17 -- may be obtained by using this software or data. The NLM and the U.S.
18 -- Government disclaim all warranties, express or implied, including
19 -- warranties of performance, merchantability or fitness for any particular
20 -- purpose.
21 --
22 -- Please cite the author in any work or product based on this material.
23 --
24 -- ===========================================================================
25 --
26 -- Author: Christiam Camacho
27 --
28 -- File Description:
29 -- ASN.1 definitions for scoring matrix
30 --
31 -- ===========================================================================
32
33 NCBI-ScoreMat DEFINITIONS ::= BEGIN
34
35 EXPORTS Pssm, PssmIntermediateData, PssmFinalData,
36 PssmParameters, PssmWithParameters;
37
38 IMPORTS Object-id FROM NCBI-General
39 Seq-entry FROM NCBI-Seqset;
40
41 -- a rudimentary block/core-model, to be used with block-based alignment
42 -- routines and threading
43
44 BlockProperty ::= SEQUENCE { -- one typed property that can be attached to a CoreBlock
45 type INTEGER { unassigned (0),
46 threshold (1), -- score threshold for heuristics
47 minscore (2), -- observed minimum score in CD
48 maxscore (3), -- observed maximum score in CD
49 meanscore (4), -- observed mean score in CD
50 variance (5), -- observed score variance
51 name (10), -- just name the block
52 is-optional(20), -- block may not have to be used
53 other (255) },
54 intvalue INTEGER OPTIONAL, -- optional integer payload of the property
55 textvalue VisibleString OPTIONAL -- optional text payload; NOTE(review): which payload goes with which type is not stated here
56 }
57
58 CoreBlock ::= SEQUENCE { -- one block of the core model, positioned on the query
59 start INTEGER, -- begin of block on query; NOTE(review): coordinate base is not stated here
60 stop INTEGER, -- end of block on query
61 minstart INTEGER OPTIONAL, -- optional N-terminal extension
62 maxstop INTEGER OPTIONAL, -- optional C-terminal extension
63 property SEQUENCE OF BlockProperty OPTIONAL -- optional list of properties attached to this block
64 }
65
66 LoopConstraint ::= SEQUENCE { -- length bounds for one unaligned region (loop)
67 minlength INTEGER DEFAULT 0, -- minimum length of unaligned region (0 when omitted)
68 maxlength INTEGER DEFAULT 100000 -- maximum length of unaligned region (100000 when omitted)
69 }
70
71 CoreDef ::= SEQUENCE { -- core model: a list of blocks plus loop-length constraints
72 nblocks INTEGER, -- number of core elements/blocks
73 blocks SEQUENCE OF CoreBlock, -- the nblocks block locations
74 loops SEQUENCE OF LoopConstraint -- (nblocks+1) loop constraints, i.e. one more than the number of blocks
75 }
76
77 -- ===========================================================================
78 -- PSI-BLAST, formatrpsdb, RPS-BLAST workflow:
79 -- ===========================================
80 --
81 -- Two possible inputs to PSI-BLAST and formatrpsdb:
82 -- 1) PssmWithParams where pssm field contains intermediate PSSM data (matrix
83 -- of frequency ratios)
84 -- 2) PssmWithParams where pssm field contains final PSSM data (matrix of
85 -- scores and statistical parameters) - such as written by cddumper
86 --
87 -- In case 1, PSI-BLAST's PSSM engine is invoked to create the PSSM and perform
88 -- the PSI-BLAST search or build the PSSM to then build the RPS-BLAST database.
89 -- In case 2, PSI-BLAST's PSSM engine is not invoked; the matrix of scores and
90 -- statistical parameters are used to perform the search in PSI-BLAST, and the
91 -- same data and the data in PssmWithParams::params::rpsdbparams are used to
92 -- build the PSSM and ultimately the RPS-BLAST database.
93 --
94 --
95 -- reads ++++++++++++++ writes
96 -- PssmWithParams ====> + PSI-BLAST + =====> PssmWithParams
97 -- ++++++++++++++ | ^
98 -- ^ | |
99 -- | | |
100 -- +===========================================+ |
101 -- | |
102 -- +===========================================+ |
103 -- | |
104 -- reads | |
105 -- v |
106 -- +++++++++++++++ writes +++++++++++++++++++++++ |
107 -- | formatrpsdb | =====> | RPS-BLAST databases | |
108 -- +++++++++++++++ +++++++++++++++++++++++ |
109 -- ^ |
110 -- | |
111 -- | reads |
112 -- +++++++++++++ |
113 -- | RPS-BLAST | |
114 -- +++++++++++++ |
115 -- |
116 -- reads ++++++++++++ writes |
117 -- Cdd ======> | cddumper | =============================+
118 -- ++++++++++++
119 --
120 -- ===========================================================================
121
122 -- Contains the PSSM's scores and its associated statistical parameters.
123 -- Dimensions and order in which scores are stored must be the same as that
124 -- specified in Pssm::numRows, Pssm::numColumns, and Pssm::byRow
125 PssmFinalData ::= SEQUENCE {
126
127 -- PSSM's scores
128 scores SEQUENCE OF INTEGER,
129
130 -- Karlin & Altschul parameter lambda produced during the PSSM's calculation
131 lambda REAL,
132
133 -- Karlin & Altschul parameter kappa produced during the PSSM's calculation
134 kappa REAL,
135
136 -- Karlin & Altschul parameter H produced during the PSSM's calculation
137 h REAL,
138
139 -- scaling factor used to obtain more precision when building the PSSM
140 -- (i.e.: scores are scaled by this value). By default, PSI-BLAST's PSSM
141 -- engine generates PSSMs which are not scaled-up; however, if PSI-BLAST is
142 -- given a scaled-up PSSM (indicated by having a
143 -- scalingFactor greater than 1), then it will scale down the PSSM to
144 -- perform the initial stages of the search with it.
145 -- N.B.: When building RPS-BLAST databases, if formatrpsdb is provided
146 -- scaled-up PSSMs, it will ensure that all PSSMs used to build the
147 -- RPS-BLAST database are scaled by the same factor (otherwise, RPS-BLAST
148 -- will silently produce incorrect results).
149 scalingFactor INTEGER DEFAULT 1,
150
151 -- Karlin & Altschul parameter lambda (ungapped variant, per the field name)
152 lambdaUngapped REAL OPTIONAL,
153
154 -- Karlin & Altschul parameter kappa (ungapped variant, per the field name)
155 kappaUngapped REAL OPTIONAL,
156
157 -- Karlin & Altschul parameter H (ungapped variant, per the field name)
158 hUngapped REAL OPTIONAL
159 }
160
161 -- Contains the PSSM's intermediate data used to create the PSSM's scores
162 -- and statistical parameters. Dimensions and order in which scores are
163 -- stored must be the same as that specified in Pssm::numRows,
164 -- Pssm::numColumns, and Pssm::byRow
165 PssmIntermediateData ::= SEQUENCE {
166
167 -- observed residue frequencies (or counts) per position of the PSSM
168 -- (prior to application of pseudocounts)
169 resFreqsPerPos SEQUENCE OF INTEGER OPTIONAL,
170
171 -- Weighted observed residue frequencies per position of the PSSM.
172 -- (N.B.: each position's weights should add up to 1.0).
173 -- This field corresponds to f_i (f sub i) in equation 2 of
174 -- Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
175 -- NOTE: this is needed for diagnostics information only (i.e.:
176 -- -out_ascii_pssm option in psiblast)
177 weightedResFreqsPerPos SEQUENCE OF REAL OPTIONAL,
178
179 -- PSSM's frequency ratios (the only mandatory field of this structure)
180 freqRatios SEQUENCE OF REAL,
181
182 -- Information content per position of the PSSM
183 -- NOTE: this is needed for diagnostics information only (i.e.:
184 -- -out_ascii_pssm option in psiblast)
185 informationContent SEQUENCE OF REAL OPTIONAL,
186
187 -- Weights for columns of the PSSM without gaps
188 -- NOTE: this is needed for diagnostics information only (i.e.:
189 -- -out_ascii_pssm option in psiblast)
190 gaplessColumnWeights SEQUENCE OF REAL OPTIONAL,
191
192 -- Used in sequence weights computation
193 -- NOTE: this is needed for diagnostics information only (i.e.:
194 -- -out_ascii_pssm option in psiblast)
195 sigma SEQUENCE OF REAL OPTIONAL,
196
197 -- Length of the aligned regions per position of the query sequence
198 -- NOTE: this is needed for diagnostics information only (i.e.:
199 -- -out_ascii_pssm option in psiblast)
200 intervalSizes SEQUENCE OF INTEGER OPTIONAL,
201
202 -- Number of matching sequences per position of the PSSM (including the
203 -- query)
204 -- NOTE: this is needed for diagnostics information only (i.e.:
205 -- -out_ascii_pssm option in psiblast)
206 numMatchingSeqs SEQUENCE OF INTEGER OPTIONAL
207 }
208
209 -- Position-specific scoring matrix
210 --
211 -- Column indices on the PSSM refer to the positions corresponding to the
212 -- query/master sequence, i.e. the number of columns (N) is the same
213 -- as the length of the query/master sequence.
214 -- Row indices refer to individual amino acid types, i.e. the number of
215 -- rows (M) is the same as the number of different residues in the
216 -- alphabet we use. Consequently, row labels are amino acid identifiers.
217 --
218 -- PSSMs are stored as linear arrays of integers. By default, we store
219 -- them column-by-column, M values for the first column followed by M
220 -- values for the second column, and so on. In order to provide
221 -- flexibility for external applications, the boolean field "byRow" is
222 -- provided to specify the storage order.
223 Pssm ::= SEQUENCE {
224
225 -- Is this a protein or nucleotide scoring matrix?
226 isProtein BOOLEAN DEFAULT TRUE,
227
228 -- PSSM identifier
229 identifier Object-id OPTIONAL,
230
231 -- The dimensions of the matrix are returned so the client can
232 -- verify that all data was received.
233
234 numRows INTEGER, -- number of rows
235 numColumns INTEGER, -- number of columns
236
237 -- rowLabels is given to note the order of residue types so that it can
238 -- be cross-checked between applications.
239 -- If this field is not given, the matrix values are presented in
240 -- order of the alphabet: ncbistdaa is used for protein, ncbi4na for
241 -- nucleotide. For proteins the values returned correspond to
242 -- (-,-), (-,A), (-,B), (-,C) ... (A,-), (A,A), (A,B), (A,C) ...
243 rowLabels SEQUENCE OF VisibleString OPTIONAL,
244
245 -- are matrices stored row by row? (column-by-column when omitted)
246 byRow BOOLEAN DEFAULT FALSE,
247
248 -- PSSM representative sequence (master)
249 query Seq-entry OPTIONAL,
250
251 -- both intermediateData and finalData can be provided, but at least one of
252 -- them must be provided.
253 -- N.B.: by default PSI-BLAST will return the PSSM in its PssmIntermediateData
254 -- representation.
255
256 -- Intermediate data for the PSSM
257 intermediateData PssmIntermediateData OPTIONAL,
258
259 -- Final representation for the PSSM
260 finalData PssmFinalData OPTIONAL
261 }
262
263 -- This structure is used to create the RPS-BLAST database auxiliary file
264 -- (*.aux) and it contains parameters set at creation time of the PSSM.
265 -- Also, the matrixName field is used by formatrpsdb to build a PSSM from
266 -- a Pssm structure which only contains PssmIntermediateData.
267 FormatRpsDbParameters ::= SEQUENCE {
268
269 -- name of the underlying score matrix whose frequency ratios were
270 -- used in PSSM construction (e.g.: BLOSUM62)
271 matrixName VisibleString,
272
273 -- gap opening penalty corresponding to the matrix named in matrixName
274 gapOpen INTEGER OPTIONAL,
275
276 -- gap extension penalty corresponding to the matrix named in matrixName
277 gapExtend INTEGER OPTIONAL
278
279 }
280
281 -- Populated by the PSSM engine of PSI-BLAST; the original source of these
282 -- values is the set of PSI-BLAST options specified using the BLAST options API
283 PssmParameters ::= SEQUENCE {
284
285 -- pseudocount constant used for PSSM. This field corresponds to beta in
286 -- equation 2 of Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
287 pseudocount INTEGER OPTIONAL,
288
289 -- data needed by formatrpsdb to create RPS-BLAST databases. matrixName is
290 -- populated by PSI-BLAST
291 rpsdbparams FormatRpsDbParameters OPTIONAL,
292
293 -- alignment constraints needed by sequence-structure threader
294 -- and other global or local block-alignment algorithms
295 constraints CoreDef OPTIONAL
296 }
297
298 -- Envelope containing PSSM and the parameters used to create it.
299 -- Provided for use in PSI-BLAST, formatrpsdb, and for the structure group.
300 PssmWithParameters ::= SEQUENCE {
301
302 -- This field is applicable to PSI-BLAST and formatrpsdb.
303 -- When both the intermediate and final PSSM data are provided in this
304 -- field, the final data (matrix of scores and associated statistical
305 -- parameters) takes precedence and that data is used for further
306 -- processing. The rationale for this is that the PSSM's scores and
307 -- statistical parameters might have been calculated by other applications
308 -- and it might not be possible to recreate it by using PSI-BLAST's PSSM
309 -- engine.
310 pssm Pssm,
311
312 -- This field's rpsdbparams is used to specify the values of options
313 -- for processing by formatrpsdb. If these are not set, the command
314 -- line defaults of formatrpsdb are applied. This field is used
315 -- by PSI-BLAST to verify that the underlying score matrix used to build
316 -- the PSSM is the same as the one being specified through the BLAST
317 -- Options API. If this field is omitted, no verification will be
318 -- performed, so be careful to keep track of what matrix was used to build
319 -- the PSSM or else the results produced by PSI-BLAST will be unreliable.
320 params PssmParameters OPTIONAL
321 }
322
323 END
|
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |