/* $RCSfile$
 * $Author: egonw $
 * $Date: 2007-01-04 18:46:10 +0100 (Thu, 04 Jan 2007) $
 * $Revision: 7636 $
 *
 * Copyright (C) 2003-2007  The Chemistry Development Kit (CDK) project
 *
 * Contact: cdk-devel@lists.sourceforge.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may distribute
 * with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package org.openscience.cdk.io;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.ChemFile;
import org.openscience.cdk.ChemModel;
import org.openscience.cdk.ChemSequence;
import org.openscience.cdk.EnzymeResidueLocator;
import org.openscience.cdk.PseudoAtom;
import org.openscience.cdk.Reaction;
import org.openscience.cdk.ReactionSet;
import org.openscience.cdk.dict.DictRef;
import org.openscience.cdk.dict.DictionaryDatabase;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IChemFile;
import org.openscience.cdk.interfaces.IChemModel;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IChemSequence;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.MACiEFormat;
import org.openscience.cdk.io.setting.BooleanIOSetting;
import org.openscience.cdk.io.setting.IOSetting;
import org.openscience.cdk.io.setting.IntegerIOSetting;
import org.openscience.cdk.io.setting.StringIOSetting;
import org.openscience.cdk.tools.LoggingTool;
import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
import org.openscience.cdk.tools.manipulator.ReactionManipulator;

/**
 * Reads an export from the MACiE enzyme reaction database.
 * Information about this database can be obtained from
 * Gemma Holiday, Cambridge University, UK, and Gail Bartlett,
 * European Bioinformatics Institute, Hinxton, UK.
 *
 * <p>This implementation is based on a dump from their database
 * on 2003-07-14.
 *
 * <p>The input is processed line by line. The reader recognizes the commands
 * <code>$RDFILE</code>, <code>$DATM</code>, <code>$RIREG</code> (start of a
 * new entry) and <code>$DTYPE</code> (followed by a <code>$DATUM</code>
 * line). Every entry becomes one ChemModel holding a ReactionSet; the
 * reactions themselves are read from secondary <code>.rxn</code> files when
 * the "ReadSecondaryFiles" setting is enabled.
 *
 * @cdk.module experimental
 *
 * @author Egon Willighagen
 * @cdk.created 2003-07-24
 *
 * @cdk.keyword file format, MACiE RDF
 * @cdk.require java1.4+
 */
public class MACiEReader extends DefaultChemObjectReader {

    /** Property key under which the <code>$DATM</code> value is stored on the ChemSequence. */
    public final static String CreationDate = "org.openscience.cdk.io.MACiE.CreationDate";
    /** Property key under which the MEDLINE_ID reference is stored on the ChemModel. */
    public final static String MedlineID = "org.openscience.cdk.io.MACiE.MedlineID";
    /** Property key under which the PDB CODE field is stored on the ChemModel. */
    public final static String PDBCode = "org.openscience.cdk.io.MACiE.PDBCode";
    /** Property key under which the EC NUMBER field is stored on the ChemModel. */
    public final static String ECNumber = "org.openscience.cdk.io.MACiE.ECNumber";
    /** Property key under which the ENZYME NAME field is stored on the ChemModel. */
    public final static String EnzymeName = "org.openscience.cdk.io.MACiE.EnzymeName";

    /** Number of leading characters skipped on $DATM, $DTYPE and $DATUM lines to reach the value. */
    private static final int KEYWORD_WIDTH = 7;

    /* Patterns are immutable, so they are compiled once and shared by all readers. */
    private static final Pattern TOP_LEVEL_DATUM = Pattern.compile("(.+):(.+)");
    private static final Pattern SUB_LEVEL_DATUM = Pattern.compile("(.+):(.+)\\((.+)\\):(.+)");
    private static final Pattern ANNOTATION_TUPLE = Pattern.compile("(\\w+)=\\((.+?)\\);(.*)");
    private static final Pattern RESIDUE_LOCATOR = Pattern.compile("[A-Z][a-z][a-z]\\d{1,5}"); // e.g. Lys150
    /**
     * Terminator of an embedded MDL mol block. The match is tolerant to the
     * amount of white space between "M" and "END" and to trailing white space.
     */
    private static final Pattern MOL_BLOCK_END = Pattern.compile("M\\s+END\\s*");

    private LineNumberReader input = null;
    private LoggingTool logger = null;

    private IntegerIOSetting selectedEntry;
    private BooleanIOSetting readSecondaryFiles;
    private StringIOSetting readSecondaryDir;

    /* Parser state; only meaningful while readReactions() is running. */
    private ChemModel currentEntry;
    private Reaction currentReaction;
    private ReactionSet currentReactionStepSet;
    private String reactionStepAnnotation;
    private String reactionStepComments;
    private boolean readThisEntry = true;

    /**
     * Constructs a new MACiEReader that reads from the given Reader.
     *
     * @param in The Reader to read from
     */
    public MACiEReader(Reader in) {
        this();
        this.input = new LineNumberReader(in);
    }

    /**
     * Constructs a new MACiEReader that reads from the given InputStream.
     *
     * @param input The InputStream to read from
     */
    public MACiEReader(InputStream input) {
        this(new InputStreamReader(input));
    }

    /**
     * Constructs a MACiEReader without input; use one of the
     * <code>setReader</code> methods before reading.
     */
    public MACiEReader() {
        logger = new LoggingTool(this);
        initIOSettings();
    }

    public IResourceFormat getFormat() {
        return MACiEFormat.getInstance();
    }

    /**
     * Sets the input to read from. A LineNumberReader is used as is, any
     * other Reader is wrapped into one.
     *
     * @param input The Reader to read from
     */
    public void setReader(Reader input) throws CDKException {
        if (input instanceof LineNumberReader) {
            this.input = (LineNumberReader)input;
        } else {
            this.input = new LineNumberReader(input);
        }
    }

    /**
     * Sets the input to read from.
     *
     * @param input The InputStream to read from
     */
    public void setReader(InputStream input) throws CDKException {
        setReader(new InputStreamReader(input));
    }

    /**
     * Tells whether this reader can fill objects of the given class.
     *
     * @param  classObject The class to test, may be null
     * @return true if the class is an IChemModel, IChemFile or IChemSequence,
     *         directly or through a super class or super interface
     */
    public boolean accepts(Class classObject) {
        if (classObject == null) {
            return false;
        }
        // isAssignableFrom also covers interfaces inherited from a super class,
        // which a scan of getInterfaces() alone does not see
        return IChemModel.class.isAssignableFrom(classObject)
            || IChemFile.class.isAssignableFrom(classObject)
            || IChemSequence.class.isAssignableFrom(classObject);
    }

    /**
     * Reads the MACiE export into an object of the same kind as the given one.
     * For an IChemSequence all entries are returned as a ChemSequence; for an
     * IChemModel only a single entry is returned (see the "SelectedEntry"
     * setting); for an IChemFile a new chem file containing the sequence of all
     * entries is returned.
     *
     * @param     object The object whose type selects what is read
     * @return    The IChemObject read
     * @exception CDKException if no input was set, the object type is not
     *            supported, or an I/O error occurred while reading
     */
    public IChemObject read(IChemObject object) throws CDKException {
        if (input == null) {
            throw new CDKException("No input has been set for the MACiEReader.");
        }
        customizeJob();

        try {
            if (object instanceof IChemSequence) {
                return readReactions(false);
            } else if (object instanceof IChemModel) {
                return readReactions(true);
            } else if (object instanceof IChemFile) {
                IChemFile chemFile = object.getBuilder().newChemFile();
                chemFile.addChemSequence((ChemSequence)readReactions(false));
                return chemFile;
            }
        } catch (IOException exception) {
            String message = "Error while reading file, line number: " + input.getLineNumber();
            logger.error(message);
            logger.debug(exception);
            throw new CDKException(message, exception);
        }
        throw new CDKException("Only supported are ChemFile, ChemSequence and ChemModel.");
    }

    /**
     * Tells whether {@link #read(IChemObject)} supports the given object.
     *
     * @param  object The object to test, may be null
     * @return true for chem sequences, chem models and chem files
     */
    public boolean accepts(IChemObject object) {
        // tests the same interfaces as read(), so that both methods agree
        if (object instanceof IChemSequence) {
            return true;
        } else if (object instanceof IChemModel) {
            return true;
        } else if (object instanceof IChemFile) {
            return true;
        } else if (object == null) {
            logger.warn("MACiEReader can not read null objects.");
        } else {
            logger.warn("MACiEReader can not read IChemObject of type: ",
                object.getClass().getName());
        }
        return false;
    }

    /**
     * Reads the entries from the input until it is exhausted, or, when a
     * single entry is requested, until the selected entry is complete.
     *
     * @param  selectEntry if true, return only one entry as ChemModel: the one
     *         whose 1-based position equals the "SelectedEntry" setting, or the
     *         last entry read when the input ends first. If false, return a
     *         ChemSequence with all entries.
     * @return a ChemModel or ChemSequence, see <code>selectEntry</code>
     */
    private IChemObject readReactions(boolean selectEntry) throws CDKException, IOException {
        ChemSequence entries = new ChemSequence();
        currentEntry = null;
        currentReaction = null;
        currentReactionStepSet = null;
        int entryCounter = 0;

        // readLine() returning null is the only reliable end-of-input signal;
        // Reader.ready() merely tells whether a read would block
        String line;
        while ((line = input.readLine()) != null) {
            if (line.startsWith("$RDFILE")) {
                entries = new ChemSequence();
            } else if (line.startsWith("$DATM")) {
                entries.setProperty(CreationDate, valueAfterKeyword(line));
            } else if (line.startsWith("$RIREG")) {
                // new entry, store previous entry if any
                if (currentEntry != null) {
                    storeCurrentEntry(entries);
                    if (selectEntry && (entryCounter == selectedEntry.getSettingValue())) {
                        logger.info("Starting reading wanted frame: ", selectedEntry);
                        return currentEntry;
                    } else {
                        logger.debug("Not reading unwanted frame: " + entryCounter);
                    }
                }
                currentEntry = new ChemModel();
                entryCounter++;
                readThisEntry = !selectEntry || entryCounter == selectedEntry.getSettingValue();
                currentReactionStepSet = new ReactionSet();
                // a reaction of the previous entry must never leak into this one
                currentReaction = null;
            } else if (line.startsWith("$DTYPE")) {
                // always consume the tuple, so that its $DATUM lines are not
                // mistaken for commands
                String[] tuple = readDtypeDatumTuple(line);
                if (currentEntry == null) {
                    logger.warn("Ignoring datum outside of an entry around line " + input.getLineNumber());
                    continue;
                }
                processDatum(tuple[0], tuple[1]);
            } else {
                logger.warn("Unrecognized command on line " + input.getLineNumber(), ": ", line);
            }
        }

        if (currentEntry != null) {
            // store last entry
            storeCurrentEntry(entries);
        }

        if (selectEntry) {
            // apparently selected last one, other already returned
            return currentEntry;
        }
        return entries;
    }

    /** Finishes the current entry: attaches its reactions and title and adds it to the sequence. */
    private void storeCurrentEntry(ChemSequence entries) {
        currentEntry.setReactionSet(currentReactionStepSet);
        createNiceMACiETitle(currentEntry);
        entries.addChemModel(currentEntry);
        fireFrameRead();
    }

    /** Dispatches one type/datum tuple to the sub level or top level field handler. */
    private void processDatum(String dataType, String datum) throws IOException, CDKException {
        // the sub level pattern is the more specific one, so it is tried first
        Matcher subLevelMatcher = SUB_LEVEL_DATUM.matcher(dataType);
        if (subLevelMatcher.matches()) {
            String field = subLevelMatcher.group(2);
            String fieldNumber = subLevelMatcher.group(3);
            String subfield = subLevelMatcher.group(4);
            processSubLevelField(field, fieldNumber, subfield, datum);
            return;
        }
        Matcher topLevelMatcher = TOP_LEVEL_DATUM.matcher(dataType);
        if (topLevelMatcher.matches()) {
            processTopLevelField(topLevelMatcher.group(2), datum);
        } else {
            logger.error("Could not parse datum tuple of type ", dataType,
                " around line " + input.getLineNumber());
        }
    }

    /** Sets the title of the model from its enzyme name, PDB code and EC number properties. */
    private void createNiceMACiETitle(ChemModel chemModel) {
        chemModel.setProperty(CDKConstants.TITLE,
            "MACIE " + chemModel.getProperty(EnzymeName) + "= " +
            "PDB: " + chemModel.getProperty(PDBCode) + ", " +
            "EC: " + chemModel.getProperty(ECNumber)
        );
    }

    /**
     * Returns the part of a command line that follows the keyword column, or
     * the empty string when the line is too short to carry a value.
     */
    private static String valueAfterKeyword(String line) {
        return line.length() > KEYWORD_WIDTH ? line.substring(KEYWORD_WIDTH) : "";
    }

    /** Reads the next line, failing with an IOException instead of returning null at end of input. */
    private String readRequiredLine(String context) throws IOException {
        String line = input.readLine();
        if (line == null) {
            throw new IOException("Unexpected end of input while reading " + context);
        }
        return line;
    }

    /**
     * Reads the datum that belongs to a <code>$DTYPE</code> line, including
     * embedded mol blocks and continuation lines.
     *
     * @param  triggerLine the <code>$DTYPE</code> line that was just read
     * @return a two element array: the data type and the (joined) datum
     * @throws IOException if the input ends before the datum is complete
     */
    private String[] readDtypeDatumTuple(String triggerLine) throws IOException {
        String datumLine = readRequiredLine("the datum of a $DTYPE line");
        String type = valueAfterKeyword(triggerLine);
        String datum = valueAfterKeyword(datumLine);
        logger.debug("Tuple TYPE: ", type);
        if (datum.endsWith("$MFMT")) {
            // deal with MDL mol content; the format is line oriented, so the
            // line breaks are kept
            StringBuffer fullDatum = new StringBuffer();
            String line;
            do {
                line = readRequiredLine("an MDL mol block");
                fullDatum.append(line).append('\n');
            } while (!MOL_BLOCK_END.matcher(line).matches());
            datum = fullDatum.toString();
        } else if (datum.endsWith("+") && (datum.length() >= 74)) {
            // deal with multiline fields: a trailing '+' announces a continuation
            StringBuffer fullDatum = new StringBuffer();
            fullDatum.append(datum.substring(0, datum.length() - 1));
            String line;
            boolean continued;
            do {
                line = readRequiredLine("a multi-line datum");
                continued = line.endsWith("+");
                // strip the continuation marker only; the final line has none
                // and must be kept completely
                fullDatum.append(continued ? line.substring(0, line.length() - 1) : line);
            } while (continued);
            datum = fullDatum.toString();
        }
        logger.debug(" DATUM: ", datum);
        return new String[] { type, datum };
    }

    /** Stores an entry level field on the current entry. */
    private void processTopLevelField(String field, String datum) throws IOException, CDKException {
        logger.debug("Processing top level field");
        if (field.equals("UNIQUE IDENTIFIER")) {
            currentEntry.setID("MACIE-" + datum);
        } else if (field.equals("EC NUMBER")) {
            currentEntry.setProperty(ECNumber, datum);
        } else if (field.equals("PDB CODE")) {
            currentEntry.setProperty(PDBCode, datum);
        } else if (field.equals("ENZYME NAME")) {
            currentEntry.setProperty(EnzymeName, datum);
        } else {
            logger.warn("Unrecognized ROOT field ", field,
                " around line " + input.getLineNumber());
        }
    }

    /** Dispatches a numbered sub level field to the handler of its field group. */
    private void processSubLevelField(String field, String fieldNumber, String subfield, String datum) throws IOException, CDKException {
        logger.debug("Processing sub level field");
        if (field.equals("OVERALL REACTION")) {
            processOverallReactionField(subfield, datum);
        } else if (field.equals("REACTION STAGES")) {
            processReactionStageField(fieldNumber, subfield, datum);
        } else if (field.equals("SUBSTRATES")) {
            logger.warn("Ignoring top level definition of substrates");
        } else if (field.equals("PRODUCTS")) {
            logger.warn("Ignoring top level definition of products");
        } else if (field.equals("REFERENCES")) {
            if (subfield.equals("MEDLINE_ID")) {
                currentEntry.setProperty(MedlineID, datum);
            }
        } else {
            logger.error("Unrecognized sub level field ", field,
                " around line " + input.getLineNumber());
        }
    }

    /**
     * Handles the subfields of "OVERALL REACTION": reads the referenced RXN
     * file and, once its annotation arrives, adds the reaction to the entry.
     *
     * @throws CDKException if the referenced secondary file does not exist
     */
    private void processOverallReactionField(String subfield, String datum) throws IOException, CDKException {
        if (subfield.equals("REACTION_ID")) {
            if (readSecondaryFiles.isSet() && readThisEntry) {
                // parse referenced file
                String filename = readSecondaryDir.getSetting() + datum + ".rxn";
                File file = new File(filename);
                if (file.exists()) {
                    logger.info("Reading overall reaction from: ", filename);
                    currentReaction = readSecondaryReaction(file);
                    currentReaction.setID(datum);
                    currentReaction.setProperty(CDKConstants.TITLE, "Overall Reaction");
                    // don't add it now, wait until annotation is parsed
                } else {
                    String error = "Cannot find secondary file: " + filename;
                    logger.error(error);
                    throw new CDKException(error);
                }
            } else {
                // make sure the annotation below is not applied to an older reaction
                currentReaction = null;
                logger.warn("Not reading overall reaction for this entry");
            }
        } else if (subfield.equals("OVERALL REACTION ANNOTATION")) {
            if (currentReaction == null) {
                logger.warn("Ignoring overall reaction annotation, no overall reaction was read");
            } else {
                parseReactionAnnotation(datum, currentReaction);
                currentReactionStepSet.addReaction(currentReaction);
            }
        }
    }

    /**
     * Handles the subfields of "REACTION STAGES". Annotation and comments are
     * remembered until the STEP_ID subfield triggers reading of the step's RXN
     * file; the step is then annotated and added to the entry.
     */
    private void processReactionStageField(String fieldNumber, String subfield, String datum) throws IOException, CDKException {
        if (subfield.equals("REACTION STAGES")) {
            // new reaction step
            // cannot create one, because CDK io does not
            // allow that (yet)
            reactionStepAnnotation = null;
            reactionStepComments = null;
        } else if (subfield.equals("ANNOTATION")) {
            reactionStepAnnotation = datum;
        } else if (subfield.equals("COMMENTS")) {
            reactionStepComments = datum;
        } else if (subfield.equals("STEP_ID")) {
            // read secondary RXN files?
            if (!(readSecondaryFiles.isSet() && readThisEntry)) {
                logger.warn("Not reading reactions of this entry.");
                return;
            }
            // parse referenced file
            String filename = readSecondaryDir.getSetting() + datum + ".rxn";
            File file = new File(filename);
            if (!file.exists()) {
                // skip this step; going on would annotate and add the
                // previously read reaction a second time
                logger.error("Cannot find secondary file: ", filename);
                return;
            }
            logger.info("Reading reaction step from: ", filename);
            currentReaction = readSecondaryReaction(file);
            currentReaction.setID(datum);
            currentReaction.setProperty(CDKConstants.TITLE, "Step " + fieldNumber);

            // convert PseudoAtom's in EnzymeResidueLocator's if appropriate
            markEnzymeResidueLocatorAtoms(currentReaction);
            // now parse annotation
            if (reactionStepAnnotation != null) {
                parseReactionAnnotation(reactionStepAnnotation, currentReaction);
            }
            // and set comments
            if (reactionStepComments != null) {
                currentReaction.setProperty(CDKConstants.COMMENT, reactionStepComments);
            }
            // now, I'm ready to add reaction
            currentReactionStepSet.addReaction(currentReaction);
        }
    }

    /**
     * Reads a Reaction from an existing secondary file and closes the file
     * afterwards.
     *
     * @throws CDKException if no reader could be created for the file content
     */
    private Reaction readSecondaryReaction(File file) throws IOException, CDKException {
        FileReader reader = new FileReader(file);
        try {
            IChemObjectReader rxnReader = new ReaderFactory().createReader(reader);
            if (rxnReader == null) {
                throw new CDKException("Could not determine the format of secondary file: " + file.getPath());
            }
            return (Reaction)rxnReader.read(new Reaction());
        } finally {
            try {
                reader.close();
            } catch (IOException exception) {
                // a failing close must not hide the outcome of the read
                logger.warn("Could not close secondary file: ", file.getPath());
            }
        }
    }

    /**
     * Replaces every PseudoAtom whose label looks like a residue locator
     * (e.g. Lys150) by an EnzymeResidueLocator.
     */
    private void markEnzymeResidueLocatorAtoms(Reaction reaction) {
        Iterator containers = ReactionManipulator.getAllAtomContainers(reaction).iterator();
        while (containers.hasNext()) {
            IAtomContainer ac = (IAtomContainer)containers.next();
            for (int i=0; i<ac.getAtomCount(); i++) {
                IAtom atom = ac.getAtom(i);
                // EnzymeResidueLocator atoms are already converted
                if (atom instanceof EnzymeResidueLocator || !(atom instanceof PseudoAtom)) {
                    continue;
                }
                PseudoAtom pseudo = (PseudoAtom)atom;
                String label = pseudo.getLabel();
                logger.debug("pseudo atom label: ", label);
                logger.debug("pseudo class: ", pseudo.getClass().getName());
                if (label != null && RESIDUE_LOCATOR.matcher(label).matches()) {
                    logger.debug("Found residueLocator: ", label);
                    // replace atom with enzymeResidueLocator
                    IAtomContainer container = ReactionManipulator.getRelevantAtomContainer(
                        reaction, pseudo
                    );
                    logger.debug("Replacing the pseudo atom with a ezymeResidueLocator atom");
                    AtomContainerManipulator.replaceAtomByAtom(container,
                        pseudo, new EnzymeResidueLocator(pseudo));
                }
            }
        }
    }

    /**
     * Splits an annotation of the form <code>field=(value);field=(value);...</code>
     * into its tuples and processes them one by one. Parsing stops at the first
     * part that does not have this form.
     */
    private void parseReactionAnnotation(String annotation, Reaction reaction) {
        logger.debug("Parsing annotation...");
        Matcher annotationTupleMatcher = ANNOTATION_TUPLE.matcher(annotation);
        while (annotationTupleMatcher.matches()) {
            String field = annotationTupleMatcher.group(1);
            String value = annotationTupleMatcher.group(2);
            processAnnotation(field, value, reaction);
            // eat next part of annotation
            String remainder = annotationTupleMatcher.group(3);
            annotationTupleMatcher = ANNOTATION_TUPLE.matcher(remainder);
        }
    }

    /**
     * Stores one annotation tuple on the reaction, or, when the field is a
     * residue locator, on the pseudo atom carrying that label.
     */
    private void processAnnotation(String field, String value, Reaction reaction) {
        logger.debug("Annote: ", field, "=", value);
        if (field.equals("RxnAtts") || field.equals("RxnType")) {
            // reaction attributes
            addDictRefedAnnotation(reaction, "Attributes", value);
        } else if (field.equals("ResiduesPresent") ||
                   field.equals("GroupTransferred") ||
                   field.equals("BondFormed") ||
                   field.equals("ReactiveCentres") ||
                   field.equals("BondCleaved") ||
                   field.equals("Products")) {
            reaction.setProperty(new DictRef("macie:" + field, value), value);
        } else if (field.equals("Reversible")) {
            if (value.equalsIgnoreCase("yes")) {
                reaction.setDirection(Reaction.BIDIRECTIONAL);
                addDictRefedAnnotation(reaction, "ReactionType", "ReversibleReaction");
            }
        } else if (field.equals("OverallReactionType")) {
            StringTokenizer tokenizer = new StringTokenizer(value, ",");
            int i = 0;
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                i++;
                reaction.setProperty(
                    DictionaryDatabase.DICTREFPROPERTYNAME + ":field:overallReactionType:" + i,
                    "macie:" + token.toLowerCase()
                );
            }
        } else if (RESIDUE_LOCATOR.matcher(field).matches()) {
            logger.debug("Found residueLocator: ", field);
            if (!annotateResidue(field, value, reaction)) {
                logger.error("MACiE annotation mentions a residue that does not exist: " + field);
            }
        } else {
            logger.error("Did not parse annotation: ", field);
        }
    }

    /**
     * Marks the first pseudo atom of the reaction whose label equals the
     * residue locator with the given role(s).
     *
     * @return true if such an atom was found
     */
    private boolean annotateResidue(String residue, String roles, Reaction reaction) {
        Iterator containers = ReactionManipulator.getAllAtomContainers(reaction).iterator();
        while (containers.hasNext()) {
            IAtomContainer ac = (IAtomContainer)containers.next();
            logger.debug("Searching for given residueLocator through #atom: ", ac.getAtomCount());
            logger.debug("Taken from reaction ", reaction.getID());
            for (int i=0; i<ac.getAtomCount(); i++) {
                if (ac.getAtom(i) instanceof PseudoAtom) {
                    // that is what we are looking for
                    PseudoAtom atom = (PseudoAtom)ac.getAtom(i);
                    // comparing from the known non-null side tolerates atoms without label
                    if (residue.equals(atom.getLabel())) {
                        // we have a hit, now mark Atom with dict refs
                        addDictRefedAnnotation(atom, "ResidueRole", roles);
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /** Adds one "macie:type" dictionary reference per comma separated value to the object. */
    private void addDictRefedAnnotation(IChemObject object, String type, String values) {
        StringTokenizer tokenizer = new StringTokenizer(values, ",");
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            object.setProperty(new DictRef("macie:" + type, token), token);
            logger.debug("Added dict ref ", token, " to ", object.getClass().getName());
        }
    }

    /**
     * Closes the underlying reader, if one was set.
     */
    public void close() throws IOException {
        if (input != null) {
            input.close();
        }
    }

    /** Creates the three IO settings with their default values. */
    private void initIOSettings() {
        selectedEntry = new IntegerIOSetting("SelectedEntry", IOSetting.LOW,
          "Which entry should I read?",
          "1");
        readSecondaryFiles = new BooleanIOSetting("ReadSecondaryFiles", IOSetting.LOW,
          "Should I read the secondary files (if available)?",
          "true");
        readSecondaryDir = new StringIOSetting("ReadSecondaryDir", IOSetting.LOW,
          "Where can the secondary files be found?",
          System.getProperty("user.home") + System.getProperty("file.separator"));
    }

    /** Gives the registered listeners the chance to change the IO settings before reading. */
    private void customizeJob() {
        fireIOSettingQuestion(selectedEntry);
        fireIOSettingQuestion(readSecondaryFiles);
        fireIOSettingQuestion(readSecondaryDir);
    }

    /**
     * Returns the settings of this reader: the selected entry, whether
     * secondary files are read, and the directory they are read from.
     */
    public IOSetting[] getIOSettings() {
        return new IOSetting[] {
            selectedEntry,
            readSecondaryFiles,
            readSecondaryDir
        };
    }

}