// PubChemXMLHelper.java (source listing from the cdk package)

/* $Revision$ $Author$ $Date$
 * 
 * Copyright (C) 2008  Egon Willighagen <egonw@users.sf.net>
 *
 * Contact: cdk-devel@lists.sourceforge.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 * All we ask is that proper credit is given for our work, which includes
 * - but is not limited to - adding the above copyright notice to the beginning
 * of your source code files, and to any copyright notice that you may distribute
 * with programs based on this work.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 */
package org.openscience.cdk.io.pubchemxml;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.vecmath.Point2d;
import javax.vecmath.Point3d;

import org.openscience.cdk.CDKConstants;
import org.openscience.cdk.config.IsotopeFactory;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IChemModel;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.interfaces.IElement;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.interfaces.IMoleculeSet;
import org.xmlpull.v1.XmlPullParser;

/**
 * Helper class to parse PubChem XML documents.
 *
 * @cdk.module io
 * @cdk.githash
 *
 * @author       Egon Willighagen <egonw@users.sf.net>
 * @cdk.created  2008-05-05
 */
00055 public class PubChemXMLHelper {

      private IChemObjectBuilder builder;
      private IsotopeFactory factory;
      
      /**
     * @throws java.io.IOException if there is error in getting the {@link IsotopeFactory}
     */
00063       public PubChemXMLHelper(IChemObjectBuilder builder) throws IOException {
            this.builder = builder;
            factory = IsotopeFactory.getInstance(builder);
      }

      // general elements
      public final static String EL_PCCOMPOUND = "PC-Compound";
      public final static String EL_PCCOMPOUNDS = "PC-Compounds";
      public final static String EL_PCSUBSTANCE = "PC-Substance";
      public final static String EL_PCSUBSTANCE_SID = "PC-Substance_sid";
  public final static String EL_PCCOMPOUND_ID = "PC-Compound_id";
  public final static String EL_PCCOMPOUND_CID = "PC-CompoundType_id_cid";
      public final static String EL_PCID_ID = "PC-ID_id";

      // atom block elements
      public final static String EL_ATOMBLOCK = "PC-Atoms";
      public final static String EL_ATOMSELEMENT = "PC-Atoms_element";
      public final static String EL_ATOMSCHARGE = "PC-Atoms_charge";
      public final static String EL_ATOMINT = "PC-AtomInt";
      public final static String EL_ATOMINT_AID = "PC-AtomInt_aid";
      public final static String EL_ATOMINT_VALUE = "PC-AtomInt_value";
      public final static String EL_ELEMENT = "PC-Element";
      
    // coordinate block elements
    public final static String EL_COORDINATESBLOCK = "PC-Compound_coords";
    public final static String EL_COORDINATES_AID = "PC-Coordinates_aid";
    public final static String EL_COORDINATES_AIDE = "PC-Coordinates_aid_E";
    public final static String EL_ATOM_CONFORMER = "PC-Conformer";
    public final static String EL_ATOM_CONFORMER_X = "PC-Conformer_x";
    public final static String EL_ATOM_CONFORMER_XE = "PC-Conformer_x_E";
    public final static String EL_ATOM_CONFORMER_Y = "PC-Conformer_y";
    public final static String EL_ATOM_CONFORMER_YE = "PC-Conformer_y_E";
    public final static String EL_ATOM_CONFORMER_Z = "PC-Conformer_z";
    public final static String EL_ATOM_CONFORMER_ZE = "PC-Conformer_z_E";

    // bond block elements
      public final static String EL_BONDBLOCK = "PC-Bonds";
      public final static String EL_BONDID1 = "PC-Bonds_aid1";
      public final static String EL_BONDID2 = "PC-Bonds_aid2";
      public final static String EL_BONDORDER = "PC-Bonds_order";
      
  // property block elements
  public final static String EL_PROPSBLOCK = "PC-Compound_props";
  public final static String EL_PROPS_INFODATA = "PC-InfoData";
  public final static String EL_PROPS_URNLABEL = "PC-Urn_label";
  public final static String EL_PROPS_URNNAME = "PC-Urn_name";
  public final static String EL_PROPS_SVAL = "PC-InfoData_value_sval";

    public IMoleculeSet parseCompoundsBlock(XmlPullParser parser) throws Exception {
      IMoleculeSet set = builder.newMoleculeSet();
      // assume the current element is PC-Compounds
      if (!parser.getName().equals(EL_PCCOMPOUNDS)) {
            return null;
      }

      while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_PCCOMPOUNDS.equals(parser.getName())) {
                        break; // done parsing compounds block
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_PCCOMPOUND.equals(parser.getName())) {
                        IMolecule molecule = parseMolecule(parser, builder);
                        if (molecule.getAtomCount() > 0) {
                              // skip empty PC-Compound's
                              set.addMolecule(molecule);
                        }
                  }
            }
      }
            return set;
    }

    public IChemModel parseSubstance(XmlPullParser parser) throws Exception {
      IChemModel model = builder.newChemModel();
      // assume the current element is PC-Compound
      if (!parser.getName().equals("PC-Substance")) {
            return null;
      }

      while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_PCSUBSTANCE.equals(parser.getName())) {
                        break; // done parsing the molecule
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_PCCOMPOUNDS.equals(parser.getName())) {
                        IMoleculeSet set = parseCompoundsBlock(parser);
                        model.setMoleculeSet(set);
                  } else if (EL_PCSUBSTANCE_SID.equals(parser.getName())) {
                        String sid = getSID(parser);
                        model.setProperty(CDKConstants.TITLE, sid);
                  }
            }
      }
            return model;
    }
      
      public String getSID(XmlPullParser parser) throws Exception {
            String sid = "unknown";
            while (parser.next() != XmlPullParser.END_DOCUMENT) {
                  if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_PCSUBSTANCE_SID.equals(parser.getName())) {
                        break; // done parsing the atom block
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_PCID_ID.equals(parser.getName())) {
                        sid = parser.nextText();
                  }
            }
            }
          return sid;
    }

    public String getCID(XmlPullParser parser) throws Exception {
        String cid = "unknown";
        while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                if (EL_PCCOMPOUND_ID.equals(parser.getName())) {
                    break; // done parsing the atom block
                }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                if (EL_PCCOMPOUND_CID.equals(parser.getName())) {
                    cid = parser.nextText();
                }
            }
        }
        return cid;
    }

  public void parseAtomElements(XmlPullParser parser, IMolecule molecule) throws Exception {
            while (parser.next() != XmlPullParser.END_DOCUMENT) {
                  if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_ATOMSELEMENT.equals(parser.getName())) {
                        break; // done parsing the atom elements
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_ELEMENT.equals(parser.getName())) {
                        int atomicNumber = Integer.parseInt(parser.nextText());
                        IElement element = factory.getElement(atomicNumber);
                        if (element == null) {
                              IAtom atom = molecule.getBuilder().newPseudoAtom();
                              molecule.addAtom(atom);
                        } else {
                              IAtom atom = molecule.getBuilder().newAtom(element.getSymbol());
                              atom.setAtomicNumber(element.getAtomicNumber());
                              molecule.addAtom(atom);
                        }
                  }
            }
            }
      }
      
      public void parserAtomBlock(XmlPullParser parser, IMolecule molecule) throws Exception {
            while (parser.next() != XmlPullParser.END_DOCUMENT) {
                  if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_ATOMBLOCK.equals(parser.getName())) {
                        break; // done parsing the atom block
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_ATOMSELEMENT.equals(parser.getName())) {
                        parseAtomElements(parser, molecule);
                  } else if (EL_ATOMSCHARGE.equals(parser.getName())) {
                        parseAtomCharges(parser, molecule);
                  }
            }
            }
      }
      
    public void parserCompoundInfoData(XmlPullParser parser, IMolecule molecule) throws Exception {
        String urn_label = null;
        String urn_name = null;
        String sval = null;
        while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                if (EL_PROPS_INFODATA.equals(parser.getName())) {
                    break; // done parsing the atom block
                }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                if (EL_PROPS_URNNAME.equals(parser.getName())) {
                    urn_name = parser.nextText();
                } else if (EL_PROPS_URNLABEL.equals(parser.getName())) {
                    urn_label = parser.nextText();
                } else if (EL_PROPS_SVAL.equals(parser.getName())) {
                    sval = parser.nextText();
                }
            }
        }
        if (urn_label != null & sval != null) {
            String property = urn_label + (urn_name == null ? "" : " (" + urn_name + ")");
            molecule.setProperty(property, sval);
        }
    }

    public void parseAtomCharges(XmlPullParser parser, IMolecule molecule) throws Exception {
      while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_ATOMSCHARGE.equals(parser.getName())) {
                        break; // done parsing the molecule
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_ATOMINT.equals(parser.getName())) {
                        int aid = 0;
                        int charge = 0;
                        while (parser.next() != XmlPullParser.END_DOCUMENT) {
                        if (parser.getEventType() == XmlPullParser.END_TAG) {
                              if (EL_ATOMINT.equals(parser.getName())) {
                                    molecule.getAtom(aid-1).setFormalCharge(charge);
                                    break; // done parsing an atoms charge
                              }
                        } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                              if (EL_ATOMINT_AID.equals(parser.getName())) {
                                    aid = Integer.parseInt(parser.nextText());
                              } else if (EL_ATOMINT_VALUE.equals(parser.getName())) {
                                    charge = Integer.parseInt(parser.nextText());
                              }
                        }
                  }
                  }
            }
      }
    }

      public IMolecule parseMolecule(XmlPullParser parser, IChemObjectBuilder builder) throws Exception {
      IMolecule molecule = builder.newMolecule();
      // assume the current element is PC-Compound
      if (!parser.getName().equals("PC-Compound")) {
            return null;
      }

      while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_PCCOMPOUND.equals(parser.getName())) {
                        break; // done parsing the molecule
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_ATOMBLOCK.equals(parser.getName())) {
                        parserAtomBlock(parser, molecule);
                  } else if (EL_BONDBLOCK.equals(parser.getName())) {
                        parserBondBlock(parser, molecule);
                } else if (EL_COORDINATESBLOCK.equals(parser.getName())) {
                    parserCoordBlock(parser, molecule);
          } else if (EL_PROPS_INFODATA.equals(parser.getName())) {
              parserCompoundInfoData(parser, molecule);
          } else if (EL_PCCOMPOUND_ID.equals(parser.getName())) {
              String cid = getCID(parser);
              molecule.setProperty("PubChem CID", cid);
                  }
            }
      }
            return molecule;
    }


      public void parserBondBlock(XmlPullParser parser, IMolecule molecule) throws Exception {
            List<String> id1s = new ArrayList<String>();
            List<String> id2s = new ArrayList<String>();
            List<String> orders = new ArrayList<String>();
            while (parser.next() != XmlPullParser.END_DOCUMENT) {
                  if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (EL_BONDBLOCK.equals(parser.getName())) {
                        break; // done parsing the atom block
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (EL_BONDID1.equals(parser.getName())) {
                        id1s = parseValues(parser, EL_BONDID1, "PC-Bonds_aid1_E");
                  } else if (EL_BONDID2.equals(parser.getName())) {
                        id2s = parseValues(parser, EL_BONDID2, "PC-Bonds_aid2_E");
                  } else if (EL_BONDORDER.equals(parser.getName())) {
                        orders = parseValues(parser, EL_BONDORDER, "PC-BondType");
                  }
            }
            }
            // aggregate information
            if (id1s.size() != id2s.size()) {
                  throw new CDKException("Inequal number of atom identifier in bond block.");
            }
            if (id1s.size() != orders.size()) {
                  throw new CDKException("Number of bond orders does not match number of bonds in bond block.");
            }
            for (int i=0; i<id1s.size(); i++) {
                  IAtom atom1 = molecule.getAtom(Integer.parseInt(id1s.get(i))-1);
                  IAtom atom2 = molecule.getAtom(Integer.parseInt(id2s.get(i))-1);
                  IBond bond = molecule.getBuilder().newBond(atom1, atom2);
                  int order = Integer.parseInt(orders.get(i));
                  if (order == 1) {
                        bond.setOrder(IBond.Order.SINGLE);
                        molecule.addBond(bond);
                  } else if (order == 2) {
                        bond.setOrder(IBond.Order.DOUBLE);
                        molecule.addBond(bond);
                  } if (order == 3) {
                        bond.setOrder(IBond.Order.TRIPLE);
                        molecule.addBond(bond);
                  } else {
                        // unknown bond order, skip
                  }
            }
      }

    public void parserCoordBlock(XmlPullParser parser, IMolecule molecule) throws Exception {
        List<String> ids = new ArrayList<String>();
        List<String> xs = new ArrayList<String>();
        List<String> ys = new ArrayList<String>();
        List<String> zs = new ArrayList<String>();
        boolean parsedFirstConformer = false;
        while (parser.next() != XmlPullParser.END_DOCUMENT) {
            if (parser.getEventType() == XmlPullParser.END_TAG) {
                if (EL_COORDINATESBLOCK.equals(parser.getName())) {
                    break; // done parsing the atom block
                } else if (EL_ATOM_CONFORMER.equals(parser.getName())) {
                    parsedFirstConformer = true;
                }
            } else if (parser.getEventType() == XmlPullParser.START_TAG &&
                       !parsedFirstConformer) {
                if (EL_COORDINATES_AID.equals(parser.getName())) {
                    ids = parseValues(parser, EL_COORDINATES_AID, EL_COORDINATES_AIDE);
                } else if (EL_ATOM_CONFORMER_X.equals(parser.getName())) {
                    xs = parseValues(parser, EL_ATOM_CONFORMER_X, EL_ATOM_CONFORMER_XE);
                } else if (EL_ATOM_CONFORMER_Y.equals(parser.getName())) {
                    ys = parseValues(parser, EL_ATOM_CONFORMER_Y, EL_ATOM_CONFORMER_YE);
                } else if (EL_ATOM_CONFORMER_Z.equals(parser.getName())) {
                    zs = parseValues(parser, EL_ATOM_CONFORMER_Z, EL_ATOM_CONFORMER_ZE);
                }
            }
        }
        // aggregate information
        boolean has2dCoords = ids.size() == xs.size() && ids.size() == ys.size();
        boolean has3dCoords = has2dCoords && ids.size() == zs.size();

        for (int i=0; i<ids.size(); i++) {
            IAtom atom = molecule.getAtom(Integer.parseInt(ids.get(i))-1);
            if (has3dCoords) {
                Point3d coord = new Point3d(
                    Double.parseDouble(xs.get(i)),
                    Double.parseDouble(ys.get(i)),
                    Double.parseDouble(zs.get(i))
                );
                atom.setPoint3d(coord);
            } else if (has2dCoords) {
                Point2d coord = new Point2d(
                    Double.parseDouble(xs.get(i)),
                    Double.parseDouble(ys.get(i))
                );
                atom.setPoint2d(coord);
            }
        }
    }

    private List<String> parseValues(XmlPullParser parser, String endTag, String fieldTag) throws Exception {
            List<String> values = new ArrayList<String>();
            while (parser.next() != XmlPullParser.END_DOCUMENT) {
                  if (parser.getEventType() == XmlPullParser.END_TAG) {
                  if (endTag.equals(parser.getName())) {
                        // done parsing the values
                        break;
                  }
            } else if (parser.getEventType() == XmlPullParser.START_TAG) {
                  if (fieldTag.equals(parser.getName())) {
                        String value = parser.nextText();
                        values.add(value);
                  }
            }
            }
            return values;
      }
      
}

// Generated by Doxygen 1.6.0