001 /*
002 * ScoringMatrix.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 import java.io.Reader;
035 import java.io.StreamTokenizer;
036 import java.io.IOException;
037
038 /**
039 * This class implements a scoring scheme based on a substitution matrix. It is useful
040 * to represent PAM and BLOSUM family of amino acids scoring matrices. Its constructor
041 * loads such matrices from a file (or any other character stream). The following is an
042 * extract of a BLOSUM62 scoring matrix file:
043 * <CODE><BLOCKQUOTE><PRE>
044 * A R N D C Q E G H I L K M F P S T W Y V B Z X *
045 * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
046 * R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
047 * ...
048 * B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
049 * Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
050 * X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
051 * * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
052 * </PRE></BLOCKQUOTE></CODE>
053 *
054 * <P>Matrices are expected to follow this format. They must have one row an one column
055 * for each defined character (not necessarily in the same order). Each row and column
056 * must start with a distinct character (no repetition) and all row characters must have a
057 * correspondent column, and vice versa.</P>
058 *
059 * <P>Value at position (i,j) represent the score of substituting character of row i for
060 * character of column j. Insertion penalties are specified by the last row while deletion
061 * penalties must be located at the last column (both represented by the special character
062 * defined by the <CODE>INDEL_CHAR</CODE> constant). Note that it only supports an
063 * additive gap cost function. In case any of this rules are not followed, an
064 * {@linkplain InvalidScoringMatrixException} exception is raised by the constructor.</P>
065 *
066 * <P>If a scoring operation (substitution, insertion or deletion) involves a character
067 * not found in the matrix, an exception is raised.</P>
068 *
069 * @author Sergio A. de Carvalho Jr.
070 * @see InvalidScoringMatrixException
071 */
072 public class ScoringMatrix extends ScoringScheme
073 {
074 /**
075 * The character that indicates the row and column for insertion and deletion
076 * penalties in the matrix.
077 */
078 protected static final char INDEL_CHAR = '*';
079
080 /**
081 * The character used to start a comment line in the scoring matrix file.
082 */
083 protected static final char COMMENT_CHAR = '#';
084
085 /**
086 * Stores matrix column headers in the order they were found.
087 */
088 protected String col_codes;
089
090 /**
091 * Stores matrix row headers in the order they were found.
092 */
093 protected String row_codes;
094
095 /**
096 * Stores values for each operation (substitution, insertion or deletion) defined by
097 * this matrix.
098 */
099 protected int matrix[][];
100
101 /**
102 * Dimension of the (squared) matrix.
103 */
104 protected int dimension;
105
106 /**
107 * The maximum absolute score that this matrix can return for any substitution,
108 * deletion or insertion.
109 */
110 protected int max_absolute_score;
111
112 /**
113 * Creates a new instance of a substitution matrix loaded from the character stream.
114 * The case of characters is significant when subsequently computing their score.
115 *
116 * @param input character stream from where the matrix is read
117 * @throws IOException if an I/O operation fails when reading from input
118 * @throws InvalidScoringMatrixException if the matrix does not comply with the
119 * specification
120 */
121 public ScoringMatrix (Reader input)
122 throws IOException, InvalidScoringMatrixException
123 {
124 this (input, true);
125 }
126
127 /**
128 * Creates a new instance of a substitution matrix loaded from the character stream.
129 * If <CODE>case_sensitive</CODE> is <CODE>true</CODE>, the case of characters is
130 * significant when subsequently computing their score; otherwise the case is
131 * ignored.
132 *
133 * @param input character stream from where the matrix is read
134 * @param case_sensitive <CODE>true</CODE> if the case of characters must be
135 * @throws IOException if an I/O operation fails when reading from input
136 * @throws InvalidScoringMatrixException if the matrix does not comply with the
137 * specification
138 */
139 public ScoringMatrix (Reader input, boolean case_sensitive)
140 throws IOException, InvalidScoringMatrixException
141 {
142 super (case_sensitive);
143
144 StreamTokenizer in;
145 StringBuffer buf = new StringBuffer();
146 int row, col, max_abs = 0;
147 char c;
148
149 // create a stream tokenizer on top of the input
150 // stream and set the COMMENT_CHAR as the comment character
151 in = new StreamTokenizer(input);
152 in.commentChar(COMMENT_CHAR);
153
154 // consider ends of line when reading the first row
155 in.eolIsSignificant(true);
156
157 // skip blank lines (if any)
158 for (in.nextToken(); in.ttype == StreamTokenizer.TT_EOL; in.nextToken());
159
160 // read first row: column character codes
161 while ((in.ttype != StreamTokenizer.TT_EOF) &&
162 (in.ttype != StreamTokenizer.TT_EOL))
163 {
164 if (in.ttype == StreamTokenizer.TT_WORD)
165 {
166 if (in.sval.length() > 1)
167 throw new InvalidScoringMatrixException
168 ("Column headers must have one-character only.");
169
170 buf.append(in.sval.charAt(0));
171 }
172 else if (in.ttype == INDEL_CHAR)
173 {
174 buf.append(INDEL_CHAR);
175 }
176 else
177 {
178 throw new InvalidScoringMatrixException("Column headers must be " +
179 "one-character codes or the special character '" + INDEL_CHAR + "'.");
180 }
181
182 in.nextToken();
183 }
184
185 // convert everything to upper case if it's not case sensitive
186 if (case_sensitive)
187 col_codes = buf.toString();
188 else
189 col_codes = buf.toString().toUpperCase();
190
191 dimension = col_codes.length();
192
193 // check if there's a column for deletion penalties
194 if (col_codes.indexOf (INDEL_CHAR) == -1)
195 throw new InvalidScoringMatrixException
196 ("Matrix have no column for deletion penalties.");
197
198 // check if there is at least one character code (besides the INDEL char)
199 if (dimension < 2)
200 throw new InvalidScoringMatrixException
201 ("Matrix must have at least one column with a character code.");
202
203 // check for repeated column codes
204 for (int i = 0; i < dimension; i++)
205 if (col_codes.indexOf(col_codes.charAt(i),i+1) > i)
206 throw new InvalidScoringMatrixException
207 ("Columns must have distinct one-character codes.");
208
209 // allocate matrix
210 matrix = new int[dimension][dimension];
211
212 // reset buffer
213 buf.delete (0, dimension);
214
215 // from now on, ignore ends of line
216 in.eolIsSignificant(false);
217 if (in.ttype == StreamTokenizer.TT_EOL) in.nextToken();
218
219 // read rest of matrix (one line for each character, but
220 // not necessarily in the same order as the columns)
221 for (row = 0; row < dimension && in.ttype != StreamTokenizer.TT_EOF; row++)
222 {
223 // start reading the line: the character code must come first
224 if (in.ttype == StreamTokenizer.TT_WORD)
225 {
226 if (in.sval.length() > 1)
227 throw new InvalidScoringMatrixException
228 ("Codes must have one character only.");
229
230 buf.append(in.sval.charAt(0));
231 }
232 else if (in.ttype == INDEL_CHAR)
233 {
234 buf.append(INDEL_CHAR);
235 }
236 else
237 {
238 throw new InvalidScoringMatrixException ("Rows must start with an" +
239 " one-character code or the special character '" + INDEL_CHAR + "'.");
240 }
241
242 // now, the set of values
243 for (col = 0; col < dimension; col++)
244 {
245 // start reading the values
246 if (in.nextToken() != StreamTokenizer.TT_NUMBER)
247 throw new InvalidScoringMatrixException
248 ("Invalid value at row " + (row+1) + ", column " + (col+1) + ".");
249
250 matrix[row][col] = (int) in.nval;
251
252 if (Math.abs(matrix[row][col]) > max_abs)
253 max_abs = Math.abs(matrix[row][col]);
254 }
255
256 in.nextToken();
257 }
258
259 // convert everything to upper case if it's not case sensitive
260 if (case_sensitive)
261 row_codes = buf.toString();
262 else
263 row_codes = buf.toString().toUpperCase();
264
265 // check if read as many rows as columns
266 if (row_codes.length() != dimension)
267 throw new InvalidScoringMatrixException
268 ("Matrix must have as many rows as columns.");
269
270 // check if there's a row for insertion penalties
271 if (row_codes.indexOf(INDEL_CHAR) == -1)
272 throw new InvalidScoringMatrixException
273 ("Matrix have no row for insertion penalties.");
274
275 // check for repeated row codes
276 for (int i = 0; i < dimension; i++)
277 if (row_codes.indexOf(row_codes.charAt(i),i+1) > i)
278 throw new InvalidScoringMatrixException
279 ("Rows must have distinct one-character codes.");
280
281 // check if all rows have a corresponding column
282 for (int i = 0; i < dimension; i++)
283 if (col_codes.indexOf(c = row_codes.charAt(i)) == -1)
284 throw new InvalidScoringMatrixException
285 ("There is no corresponding column for row character '" + c + "'.");
286
287 // store the maximum absolute value found
288 this.max_absolute_score = max_abs;
289 }
290
291 /**
292 * Returns the score of a substitution of character <CODE>a</CODE> for character
293 * <CODE>b</CODE> according to this scoring matrix.
294 *
295 * @param a first character
296 * @param b second character
297 * @return score of a substitution of character <CODE>a</CODE> for <CODE>b</CODE>
298 * @throws IncompatibleScoringSchemeException if this substitution is not defined
299 */
300 public int scoreSubstitution (char a, char b)
301 throws IncompatibleScoringSchemeException
302 {
303 int r,c;
304
305 if (case_sensitive)
306 {
307 r = row_codes.indexOf(a);
308 c = col_codes.indexOf(b);
309 }
310 else
311 {
312 r = row_codes.indexOf(Character.toUpperCase(a));
313 c = col_codes.indexOf(Character.toUpperCase(b));
314 }
315
316 if (r < 0 || c < 0)
317 throw new IncompatibleScoringSchemeException ("Substitution of character " +
318 a + " for " + b + " is not defined.");
319
320 return matrix[r][c];
321 }
322
323 /**
324 * Returns the score of an insertion of character <CODE>a</CODE> according to this
325 * scoring matrix.
326 *
327 * @param a character to be inserted
328 * @return score of insertion of <CODE>a</CODE>
329 * @throws IncompatibleScoringSchemeException if this character is not recognised
330 */
331 public int scoreInsertion (char a) throws IncompatibleScoringSchemeException
332 {
333 return scoreSubstitution (INDEL_CHAR, a);
334 }
335
336 /**
337 * Returns the score of a deletion of character <CODE>a</CODE> according to this
338 * scoring matrix.
339 *
340 * @param a character to be deleted
341 * @return score of deletion of <CODE>a</CODE>
342 * @throws IncompatibleScoringSchemeException if this character is not recognised
343 */
344 public int scoreDeletion (char a) throws IncompatibleScoringSchemeException
345 {
346 return scoreSubstitution (a, INDEL_CHAR);
347 }
348
349 /**
350 * Tells whether this scoring scheme supports partial matches, which it does, although
351 * a particular scoring matrix loaded by this instace might not. A partial match is
352 * a situation when two characters are not equal but, for any reason, are regarded
353 * as similar by this scoring scheme, which then returns a positive score value. This
354 * is common for amino acid scoring matrices.
355 *
356 * @return always return <CODE>true</CODE>
357 */
358 public boolean isPartialMatchSupported ()
359 {
360 return true;
361 }
362
363 /**
364 * Returns the maximum absolute score that this scoring scheme can return for any
365 * substitution, deletion or insertion.
366 *
367 * @return maximum absolute score that can be returned
368 */
369 public int maxAbsoluteScore ()
370 {
371 return max_absolute_score;
372 }
373
374 /**
375 * Returns a String representation of this scoring matrix.
376 *
377 * @return a String representation of this scoring matrix
378 */
379 public String toString ()
380 {
381 int row, col;
382
383 StringBuffer buf = new StringBuffer();
384
385 // column numbers
386 buf.append("Scoring matrix:\n\t");
387 for (col = 0; col < dimension; col++)
388 {
389 buf.append("\t" + col);
390 }
391 buf.append("\n\t");
392
393 // column headers
394 for (col = 0; col < dimension; col++)
395 {
396 buf.append('\t');
397 buf.append(col_codes.charAt(col));
398 }
399
400 // rest of matrix
401 for (row = 0; row < dimension; row++)
402 {
403 // row number and code
404 buf.append("\n" + row + "\t" + row_codes.charAt(row));
405
406 for (col = 0; col < dimension; col++)
407 {
408 buf.append('\t');
409 buf.append(matrix[row][col]);
410 }
411 }
412
413 return buf.toString();
414 }
415 }