PdbxReader
Utilities for reading mmCIF data files and dictionaries.
Source code in mmcif/io/PdbxReader.py
class PdbxReader(object):
    """Utilities for reading mmCIF data files and dictionaries.

    The reader tokenizes the lines of an input file handle and appends the data and
    definition containers that it builds to a caller-supplied list (see read()).
    """

    def __init__(self, ifh):
        """Store the input handle and set up the reserved-word state table.

        Args:
            ifh: input file handle returned by open(); it is iterated line by line.
        """
        self.__curLineNumber = 0
        self.__ifh = ifh
        # Maps the text preceding the first underscore of a word to a parser state.
        self.__stateDict = {"data": "ST_DATA_CONTAINER", "loop": "ST_TABLE", "global": "ST_GLOBAL_CONTAINER", "save": "ST_DEFINITION", "stop": "ST_STOP"}

    def read(self, containerList, selectList=None, excludeFlag=False):
        """Parse the input file and append definition and data containers to containerList.

        Args:
            containerList: list that receives the containers created by the parser.
            selectList: optional list of category names; when non-empty only these
                categories are added to their containers (or, with excludeFlag set,
                every category except these).
            excludeFlag: treat selectList as an exclusion list.

        Returns:
            None - results are delivered by appending to containerList.

        Raises:
            PdbxSyntaxError: re-raised unchanged when the parser reports a syntax error.
            PdbxError: on a character decoding failure, on any other unexpected
                exception, and when the parser returns before the token stream is
                exhausted.
        """
        sL = selectList if selectList else []
        catSelectD = {k: k for k in sL}
        self.__curLineNumber = 0
        try:
            self.__parser(self.__tokenizer(self.__ifh), containerList, categorySelectionD=catSelectD, excludeFlag=excludeFlag)
        except RuntimeError as e:
            # will be raised at the end of token iterator - not an error -
            logger.debug("Normal termination after reading %d lines with %s", self.__curLineNumber, str(e))
        except StopIteration:
            # will be raised at the end of token iterator - not an error -
            logger.debug("Normal termination after reading %d lines", self.__curLineNumber)
        except PdbxSyntaxError as e:
            logger.debug("Caught syntax exception at %d", self.__curLineNumber)
            raise e
        except UnicodeDecodeError as e:
            logger.debug("Caught character encoding exception at %d with %s", self.__curLineNumber, str(e))
            raise PdbxError("Character encoding error at line %d" % self.__curLineNumber)
        except Exception as e:
            raise PdbxError("Failing at line %d with %s" % (self.__curLineNumber, str(e)))
        else:
            # The parser finishes normally by exhausting the tokenizer (handled above);
            # a plain return from the parser is reported as an error.
            raise PdbxError("Miscellaneous parsing error at line %d" % self.__curLineNumber)

    def __allSelected(self, container, catSelectD):
        """Test the input container for completeness relative to the input category selection dictionary.

        Returns True only when catSelectD is non-empty and the container reports more
        object names than there are selected categories. Any failure while querying
        the container (for instance container is None) yields False.
        """
        if catSelectD:
            try:
                nl = container.getObjNameList()
                ok = len(nl) > len(catSelectD)
                logger.debug("nl %d length catSelectD %d returning %r", len(nl), len(catSelectD), ok)
            except Exception:
                ok = False
        else:
            ok = False
        return ok

    def __syntaxError(self, errText):
        """Raise PdbxSyntaxError with errText prefixed by the current line number."""
        msg = " [Line: %d] %s" % (self.__curLineNumber, errText)
        raise PdbxSyntaxError(msg)

    def __getContainerName(self, inWord):
        """Returns the name of the data block or saveframe container.

        The first five characters of the word (e.g. 'data_' or 'save_') are dropped
        and the remainder is stripped of surrounding whitespace.
        """
        return str(inWord[5:]).strip()

    def __getState(self, inWord):
        """Identifies reserved syntax elements and assigns an associated state.

        on return: (reserved word, state)
        where -
           reserved word -  is one of CIF syntax elements:
                            data, loop, global, save, or stop
           state - the parser state required to process this next section.

        Words without an underscore, or whose lower-cased prefix is not in the state
        table, return (None, "ST_UNKNOWN").
        """
        i = inWord.find("_")
        if i == -1:
            return None, "ST_UNKNOWN"

        try:
            rWord = inWord[:i].lower()
            return rWord, self.__stateDict[rWord]
        except Exception:
            return None, "ST_UNKNOWN"

    def __parser(self, tokenizer, containerList, categorySelectionD=None, excludeFlag=False):
        """Parser for PDBx data files and dictionaries.

        Input - tokenizer() reentrant method recognizing data item names (_category.attribute)
                quoted strings (single, double and multi-line semi-colon delimited), and unquoted
                strings.

                containerList - list-type container for data and definition objects parsed
                                from the input file.

                categorySelectionD - optional dictionary keyed by category name used to
                                select (or, with excludeFlag, exclude) categories.

        On return:
                The input containerList is appended with data and definition objects -

        The method normally terminates when next(tokenizer) raises StopIteration,
        which propagates to the caller. Syntax problems are raised through
        __syntaxError(); the 'return' statements that follow those calls are
        defensive only.
        """
        catSelectD = categorySelectionD if categorySelectionD is not None else {}
        logger.debug("Exclude Flag %r Category selection %r", excludeFlag, catSelectD)
        # Working container - data or definition
        curContainer = None
        # the last container of type data -
        previousDataContainer = None
        #
        # Working category container
        categoryIndex = {}
        curCategory = None
        #
        curRow = None
        state = None

        # Find the first reserved word and begin capturing data.
        #
        while True:
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
            if curWord is None:
                continue
            reservedWord, state = self.__getState(curWord)
            if reservedWord is not None:
                break

        while True:
            #
            #  Set the current state  -
            #
            #  At this point in the processing cycle we are expecting a token containing
            #  either a '_category.attribute'  or a reserved word.
            #
            if curCatName is not None:
                state = "ST_KEY_VALUE_PAIR"
            elif curWord is not None:
                reservedWord, state = self.__getState(curWord)
            else:
                self.__syntaxError("Miscellaneous syntax error")
                return

            #
            # Process  _category.attribute  value assignments
            #
            if state == "ST_KEY_VALUE_PAIR":
                try:
                    curCategory = categoryIndex[curCatName]
                except KeyError:
                    # A new category is encountered - create a container and add a row
                    curCategory = categoryIndex[curCatName] = DataCategory(curCatName)
                    #
                    #  check if we have all of the selection
                    if not excludeFlag and self.__allSelected(curContainer, catSelectD):
                        return
                    try:
                        if catSelectD:
                            if not excludeFlag and curCatName in catSelectD:
                                curContainer.append(curCategory)
                            elif excludeFlag and curCatName not in catSelectD:
                                curContainer.append(curCategory)
                            else:
                                logger.debug("Skipped unselected/excluded category %s", curCatName)
                        else:
                            curContainer.append(curCategory)
                    except AttributeError:
                        # curContainer is still None - no data_/save_/global_ section was opened
                        self.__syntaxError("Category cannot be added to data_ block")
                        return

                    curRow = []
                    curCategory.append(curRow)
                else:
                    # Recover the existing row from the category
                    try:
                        curRow = curCategory.getRow(0)
                    except IndexError:
                        self.__syntaxError("Internal index error accessing category data")
                        return

                # Check for duplicate attributes and add attribute to table.
                if curAttName in curCategory.getAttributeList():
                    self.__syntaxError("Duplicate attribute encountered in category")
                    return
                else:
                    curCategory.appendAttribute(curAttName)

                # Get the data for this attribute from the next token
                tCat, _, curQuotedString, curWord = next(tokenizer)

                if tCat is not None or (curQuotedString is None and curWord is None):
                    self.__syntaxError("Missing data for item _%s.%s" % (curCatName, curAttName))

                if curWord is not None:
                    #
                    # Validation check token for misplaced reserved words  -
                    #
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        self.__syntaxError("Unexpected reserved word: %s" % (reservedWord))

                    curRow.append(curWord)

                elif curQuotedString is not None:
                    curRow.append(curQuotedString)

                else:
                    self.__syntaxError("Missing value in item-value pair")

                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                continue

            #
            # Process a loop_ declaration and associated data -
            #
            elif state == "ST_TABLE":

                # The category name in the next curCatName,curAttName pair
                #    defines the name of the category container.
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                if curCatName is None or curAttName is None:
                    self.__syntaxError("Unexpected token in loop_ declaration")
                    return

                # Check for a previous category declaration.
                if curCatName in categoryIndex:
                    self.__syntaxError("Duplicate category declaration in loop_")
                    return

                curCategory = DataCategory(curCatName)

                #
                #  check if we have all of the selection
                if not excludeFlag and self.__allSelected(curContainer, catSelectD):
                    return
                try:
                    if catSelectD:
                        if not excludeFlag and curCatName in catSelectD:
                            curContainer.append(curCategory)
                        elif excludeFlag and curCatName not in catSelectD:
                            curContainer.append(curCategory)
                        else:
                            logger.debug("Skipped unselected/excluded category %s", curCatName)
                    else:
                        curContainer.append(curCategory)
                except AttributeError:
                    self.__syntaxError("loop_ declaration outside of data_ block or save_ frame")
                    return

                curCategory.appendAttribute(curAttName)

                # Read the rest of the loop_ declaration
                while True:
                    curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                    if curCatName is None:
                        break

                    if curCatName != curCategory.getName():
                        self.__syntaxError("Changed category name in loop_ declaration")
                        return

                    curCategory.appendAttribute(curAttName)

                # If the next token is a 'word', check it for any reserved words -
                if curWord is not None:
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        if reservedWord == "stop":
                            return
                        else:
                            self.__syntaxError("Unexpected reserved word after loop declaration: %s" % (reservedWord))

                # Read the table of data for this loop_ -
                while True:
                    curRow = []
                    curCategory.append(curRow)

                    # Consume one token per declared attribute to fill the row.
                    for _ in curCategory.getAttributeList():
                        if curWord is not None:
                            curRow.append(curWord)
                        elif curQuotedString is not None:
                            curRow.append(curQuotedString)

                        curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                    # loop_ data processing ends if -

                    # A new _category.attribute is encountered
                    if curCatName is not None:
                        break

                    # A reserved word is encountered
                    if curWord is not None:
                        reservedWord, state = self.__getState(curWord)
                        if reservedWord is not None:
                            break

                continue

            elif state == "ST_DEFINITION":
                # Ignore trailing unnamed saveframe delimiters e.g. 'save'
                sName = self.__getContainerName(curWord)
                if sName:
                    curContainer = DefinitionContainer(sName)
                    containerList.append(curContainer)
                    categoryIndex = {}
                    curCategory = None
                else:
                    # reset current container to the last data container
                    curContainer = previousDataContainer

                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_DATA_CONTAINER":
                #
                dName = self.__getContainerName(curWord)
                if not dName:
                    dName = "unidentified"
                curContainer = DataContainer(dName)
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                previousDataContainer = curContainer
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_STOP":
                # The tokenizer drops the bare 'stop_' word, but words that merely begin
                # with 'stop_' still arrive here. Advance to the next token so that the
                # same word is not re-examined forever.
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                continue

            elif state == "ST_GLOBAL_CONTAINER":
                # State name matches the value stored for 'global' in the state table.
                curContainer = DataContainer("blank-global")
                curContainer.setGlobal()
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_UNKNOWN":
                self.__syntaxError("Unrecognized syntax element: " + str(curWord))
                return

            else:
                # Defensive: an unhandled state would otherwise repeat on the same token.
                self.__syntaxError("Unrecognized parser state: " + str(state))
                return

    def __tokenizer(self, ifh):
        """Tokenizer method for the mmCIF syntax file -

        Each return/yield from this method returns information about
        the next token in the form of a tuple with the following structure.

        (category name, attribute name, quoted strings, words w/o quotes or white space)

        Lines beginning with '#' are skipped, and the bare word 'stop_' (any case) is
        discarded. The generator ends when the input is exhausted, including when the
        input ends inside an unterminated semi-colon delimited string.
        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
        #                                     outside of this regex.
        #  Differentiated the regular expression to the better handle embedded quotes.
        #
        mmcifRe = re.compile(
            r"(?:"
            r"(?:_(.+?)[.](\S+))"
            r"|"  # _category.attribute
            r"(?:['](.*?)(?:[']\s|[']$))"
            r"|"  # single quoted strings
            r'(?:["](.*?)(?:["]\s|["]$))'
            r"|"  # double quoted strings
            r"(?:\s*#.*$)"
            r"|"  # comments (dumped)
            r"(\S+)"  # unquoted words
            r")"
        )

        fileIter = iter(ifh)

        # Tokenizer loop begins here ---
        while True:
            try:
                line = next(fileIter)
                self.__curLineNumber += 1

                # Dump comments
                if line.startswith("#"):
                    continue

                # Gobble up the entire semi-colon/multi-line delimited string and
                #    and stuff this into the string slot in the return tuple
                #
                if line.startswith(";"):
                    mlString = [line[1:]]
                    while True:
                        line = next(fileIter)
                        self.__curLineNumber += 1
                        if line.startswith(";"):
                            break
                        mlString.append(line)

                    # remove trailing new-line that is part of the \n; delimiter
                    # (rstrip() also removes any other trailing whitespace on that line)
                    mlString[-1] = mlString[-1].rstrip()
                    #
                    yield (None, None, "".join(mlString), None)
                    #
                    # Need to process the remainder of the current line -
                    line = line[1:]

                # Apply regex to the current line consolidate the single/double
                # quoted within the quoted string category
                for it in mmcifRe.finditer(line):
                    tgroups = it.groups()
                    #
                    if tgroups[4] is not None and tgroups[4].lower() == "stop_":
                        continue
                    # A match with every group empty is a comment - nothing to yield.
                    if tgroups != (None, None, None, None, None):
                        if tgroups[2] is not None:
                            qs = tgroups[2]
                        elif tgroups[3] is not None:
                            qs = tgroups[3]
                        else:
                            qs = None
                        groups = (tgroups[0], tgroups[1], qs, tgroups[4])
                        yield groups
            except StopIteration:
                return
__init__(self, ifh)
special
ifh - input file handle returned by open()
Source code in mmcif/io/PdbxReader.py
def __init__(self, ifh):
"""ifh - input file handle returned by open()"""
#
self.__curLineNumber = 0
self.__ifh = ifh
self.__stateDict = {"data": "ST_DATA_CONTAINER", "loop": "ST_TABLE", "global": "ST_GLOBAL_CONTAINER", "save": "ST_DEFINITION", "stop": "ST_STOP"}
read(self, containerList, selectList=None, excludeFlag=False)
Appends the parsed definition and data containers to the input list.
Returns None; the results are delivered by extending containerList in place.
Source code in mmcif/io/PdbxReader.py
def read(self, containerList, selectList=None, excludeFlag=False):
    """
    Parse the input handle, appending definition and data containers to containerList.

    selectList, when non-empty, names the categories to keep (or to drop when
    excludeFlag is set).  Nothing is returned; syntax errors are re-raised as
    PdbxSyntaxError and every other failure is reported as PdbxError.
    """
    # Selection lookup: each requested category name maps to itself.
    catSelectD = {catName: catName for catName in (selectList or [])}
    self.__curLineNumber = 0
    try:
        self.__parser(
            self.__tokenizer(self.__ifh),
            containerList,
            categorySelectionD=catSelectD,
            excludeFlag=excludeFlag,
        )
    except RuntimeError as err:
        # raised when the token iterator is exhausted - not an error -
        logger.debug("Normal termination after reading %d lines with %s", self.__curLineNumber, str(err))
    except StopIteration:
        # raised when the token iterator is exhausted - not an error -
        logger.debug("Normal termination after reading %d lines", self.__curLineNumber)
    except PdbxSyntaxError as err:
        logger.debug("Caught syntax exception at %d", self.__curLineNumber)
        raise err
    except UnicodeDecodeError as err:
        logger.debug("Caught character encoding exception at %d with %s", self.__curLineNumber, str(err))
        raise PdbxError("Character encoding error at line %d" % self.__curLineNumber)
    except Exception as err:
        raise PdbxError("Failing at line %d with %s" % (self.__curLineNumber, str(err)))
    else:
        # The parser returned without exhausting the token stream.
        raise PdbxError("Miscellaneous parsing error at line %d" % self.__curLineNumber)