RDKit
Open-source cheminformatics and machine learning.
FileParserUtils.h
Go to the documentation of this file.
1//
2// Copyright (C) 2010-2022 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FILEPARSERUTILS_H
12#define RD_FILEPARSERUTILS_H
13
14#include <string>
15#include <iostream>
17#include <boost/lexical_cast.hpp>
18#include <boost/algorithm/string.hpp>
19#include <boost/format.hpp>
21
22namespace RDKit {
23class RWMol;
24class Conformer;
25
26namespace FileParserUtils {
27template <typename T>
28T stripSpacesAndCast(const std::string &input, bool acceptSpaces = false) {
29 std::string trimmed = boost::trim_copy(input);
30 if (acceptSpaces && trimmed == "") {
31 return 0;
32 } else {
33 return boost::lexical_cast<T>(trimmed);
34 }
35}
// Text-to-number converters for int, unsigned int and double.
// Only the declarations are visible in this header. Note that acceptSpaces
// defaults to true here, whereas it defaults to false for
// stripSpacesAndCast() above.
// NOTE(review): presumably thin wrappers around stripSpacesAndCast<T>() -
// confirm against the implementation file.
36RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input,
37 bool acceptSpaces = true);
38RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input,
39 bool acceptSpaces = true);
40RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input,
41 bool acceptSpaces = true);
42
43// parses info from a V3000 CTAB into a molecule
45 int confId = -1);
46// reads a line from an MDL v3K CTAB
// inStream is the stream to read from; the line that was read is returned as
// a string.
// NOTE(review): `line` is taken by non-const reference - presumably the
// caller's running line counter, advanced as input is consumed. Confirm
// against the implementation.
47RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream,
48 unsigned int &line);
49
50// nAtoms and nBonds are ignored on input, set on output
52 std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
53 bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
54 bool strictParsing = true, bool expectMEND = true);
55
56// nAtoms and nBonds are used
58 std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
59 bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
60 bool strictParsing = true);
61
62//! finishes up the processing (sanitization, etc.) of a molecule read from
63//! CTAB
65 bool chiralityPossible,
66 bool sanitize, bool removeHs);
67
68//! Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead
70
71//! applies a particular property to the atoms as an atom property list
72template <typename T>
73void applyMolListPropToAtoms(ROMol &mol, const std::string &pn,
74 const std::string &prefix,
75 const std::string &missingValueMarker = "n/a") {
76 std::string atompn = pn.substr(prefix.size());
77 std::string strVect = mol.getProp<std::string>(pn);
78 std::vector<std::string> tokens;
79 boost::split(tokens, strVect, boost::is_any_of(" \t\n"),
80 boost::token_compress_on);
81 if (tokens.size() < mol.getNumAtoms()) {
83 << "Property list " << pn << " too short, only " << tokens.size()
84 << " elements found. Ignoring it." << std::endl;
85 return;
86 }
87 std::string mv = missingValueMarker;
88 size_t first_token = 0;
89 if (tokens.size() == mol.getNumAtoms() + 1 && tokens[0].front() == '[' &&
90 tokens[0].back() == ']') {
91 mv = std::string(tokens[0].begin() + 1, tokens[0].end() - 1);
92 first_token = 1;
93 }
94 if (mv.empty()) {
95 BOOST_LOG(rdWarningLog) << "Missing value marker for property " << pn
96 << " is empty." << std::endl;
97 }
98 for (size_t i = first_token; i < tokens.size(); ++i) {
99 if (tokens[i] != mv) {
100 unsigned int atomid = i - first_token;
101 try {
102 T apv = boost::lexical_cast<T>(tokens[i]);
103 mol.getAtomWithIdx(atomid)->setProp(atompn, apv);
104 } catch (const boost::bad_lexical_cast &) {
106 << "Value " << tokens[i] << " for property " << pn << " of atom "
107 << atomid << " can not be parsed. Ignoring it." << std::endl;
108 }
109 }
110 }
111}
112
113//! applies all properties matching a particular prefix as an atom property
114//! list
115template <typename T>
116void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix,
117 const std::string missingValueMarker = "n/a") {
118 for (auto pn : mol.getPropList()) {
119 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
120 applyMolListPropToAtoms<T>(mol, pn, prefix, missingValueMarker);
121 }
122 }
123}
// Common prefix of the molecule property names that hold atom property lists;
// the type-specific prefixes used below ("atom.prop.", "atom.iprop.",
// "atom.dprop.", "atom.bprop.") are built on top of it.
124static const std::string atomPropPrefix = "atom.";
125//! if the property name matches our rules for atom property lists, we'll
126//! apply it to the atoms
128 ROMol &mol, const std::string pn,
129 const std::string &missingValueMarker = "n/a") {
130 if (pn.find(atomPropPrefix) == 0 && pn.length() > atomPropPrefix.length()) {
131 std::string prefix = atomPropPrefix + "prop.";
132 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
133 applyMolListPropToAtoms<std::string>(mol, pn, prefix, missingValueMarker);
134 } else {
135 prefix = atomPropPrefix + "iprop.";
136 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
137 applyMolListPropToAtoms<std::int64_t>(mol, pn, prefix,
138 missingValueMarker);
139 } else {
140 prefix = atomPropPrefix + "dprop.";
141 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
142 applyMolListPropToAtoms<double>(mol, pn, prefix, missingValueMarker);
143 } else {
144 prefix = atomPropPrefix + "bprop.";
145 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
146 applyMolListPropToAtoms<bool>(mol, pn, prefix, missingValueMarker);
147 }
148 }
149 }
150 }
151 }
152}
153//! loops over all properties and applies the ones that match the rules for
154//! atom property lists to the atoms
156 ROMol &mol, const std::string &missingValueMarker = "n/a") {
157 for (auto pn : mol.getPropList()) {
158 processMolPropertyList(mol, pn, missingValueMarker);
159 }
160}
161template <typename T>
162std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName,
163 std::string missingValueMarker = "",
164 unsigned int lineSize = 190) {
165 std::string res;
166 std::string propVal;
167 if (!missingValueMarker.empty()) {
168 propVal += boost::str(boost::format("[%s] ") % missingValueMarker);
169 } else {
170 missingValueMarker = "n/a";
171 }
172 for (const auto &atom : mol.atoms()) {
173 std::string apVal = missingValueMarker;
174 if (atom->hasProp(atomPropName)) {
175 T tVal = atom->getProp<T>(atomPropName);
176 apVal = boost::lexical_cast<std::string>(tVal);
177 // seems like this should work, but it doesn't:
178 // atom->getProp(atomPropName,apVal);
179 }
180 if (propVal.length() + apVal.length() + 1 >= lineSize) {
181 // remove trailing space:
182 propVal.pop_back();
183 res += propVal + "\n";
184 propVal = "";
185 }
186 propVal += apVal + " ";
187 }
188 if (!propVal.empty()) {
189 // remove the trailing space:
190 propVal.pop_back();
191 res += propVal;
192 }
193 return res;
194}
196 ROMol &mol, const std::string &atomPropName,
197 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
198 std::string molPropName = "atom.iprop." + atomPropName;
199 mol.setProp(molPropName,
200 getAtomPropertyList<boost::int64_t>(
201 mol, atomPropName, missingValueMarker, lineSize));
202}
204 ROMol &mol, const std::string &atomPropName,
205 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
206 std::string molPropName = "atom.dprop." + atomPropName;
207 mol.setProp(molPropName,
208 getAtomPropertyList<double>(mol, atomPropName, missingValueMarker,
209 lineSize));
210}
212 ROMol &mol, const std::string &atomPropName,
213 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
214 std::string molPropName = "atom.bprop." + atomPropName;
215 mol.setProp(molPropName,
216 getAtomPropertyList<bool>(mol, atomPropName, missingValueMarker,
217 lineSize));
218}
220 ROMol &mol, const std::string &atomPropName,
221 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
222 std::string molPropName = "atom.prop." + atomPropName;
223 mol.setProp(molPropName,
224 getAtomPropertyList<std::string>(mol, atomPropName,
225 missingValueMarker, lineSize));
226}
227
228} // namespace FileParserUtils
229} // namespace RDKit
230
231#endif
#define BOOST_LOG(__arg__)
Definition: RDLog.h:92
RDKIT_RDGENERAL_EXPORT RDLogger rdWarningLog
The class for representing atoms.
Definition: Atom.h:68
The class for representing 2D or 3D conformation of a molecule.
Definition: Conformer.h:45
void getProp(const std::string &key, T &res) const
allows retrieval of a particular property value
Definition: RDProps.h:107
void setProp(const std::string &key, T val, bool computed=false) const
sets a property value
Definition: RDProps.h:77
STR_VECT getPropList(bool includePrivate=true, bool includeComputed=true) const
returns a list with the names of our properties
Definition: RDProps.h:45
Atom * getAtomWithIdx(unsigned int idx)
returns a pointer to a particular Atom
unsigned int getNumAtoms() const
returns our number of atoms
Definition: ROMol.h:395
CXXAtomIterator< MolGraph, Atom * > atoms()
C++11 Range iterator.
Definition: ROMol.h:257
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:153
void processMolPropertyList(ROMol &mol, const std::string pn, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT std::string getV3000CTAB(const ROMol &tmol, int confId=-1)
RDKIT_FILEPARSERS_EXPORT void finishMolProcessing(RWMol *res, bool chiralityPossible, bool sanitize, bool removeHs)
void createAtomDoublePropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input, bool acceptSpaces=true)
void createAtomIntPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input, bool acceptSpaces=true)
RDKIT_FILEPARSERS_EXPORT Atom * replaceAtomWithQueryAtom(RWMol *mol, Atom *atom)
Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead.
void createAtomStringPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
void applyMolListPropToAtoms(ROMol &mol, const std::string &pn, const std::string &prefix, const std::string &missingValueMarker="n/a")
applies a particular property to the atoms as an atom property list
std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName, std::string missingValueMarker="", unsigned int lineSize=190)
void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix, const std::string missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true, bool expectMEND=true)
void processMolPropertyLists(ROMol &mol, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true)
static const std::string atomPropPrefix
RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream, unsigned int &line)
T stripSpacesAndCast(const std::string &input, bool acceptSpaces=false)
RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input, bool acceptSpaces=true)
void createAtomBoolPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
Definition: Abbreviations.h:18