DataCategoryTyped
A subclass of DataCategory with methods to apply explicit data typing.
Source code in mmcif/api/DataCategoryTyped.py
class DataCategoryTyped(DataCategory):
"""A subclass of DataCategory with methods to apply explicit data typing."""
def __init__(
self,
dataCategoryObj,
dictionaryApi=None,
raiseExceptions=True,
copyInputData=True,
ignoreCastErrors=False,
useCifUnknowns=True,
missingValueString=None,
missingValueInteger=None,
missingValueFloat=None,
**kwargs
):
"""A subclass of DataCategory with methods to apply explicit data typing.
Args:
dataCategoryObj (object): DataCategory object instance
dictionaryApi (object, optional): instance of DictionaryApi class. Defaults to None.
raiseExceptions (bool, optional): raise exceptions. Defaults to True.
copyInputData (bool, optional): make a new copy input data. Defaults to True.
ignoreCastErrors (bool, optional): ignore data processing cast errors. Defaults to False.
useCifUnknowns (bool, optional): use CIF style missing values ('.' and '?'). Defaults to True.
missingValueString (str, optional): missing string value . Defaults to None.
missingValueInteger (integer, optional): missing integer value. Defaults to None.
missingValueFloat (float, optional): missing float value. Defaults to None.
applyMolStarTypes (bool, optional): use Mol* forced integer types. Defaults to True.
"""
self.__dcObj = dataCategoryObj
super(DataCategoryTyped, self).__init__(
self.__dcObj.getName(),
self.__dcObj.getAttributeList(),
self.__dcObj.data,
raiseExceptions=raiseExceptions,
copyInputData=copyInputData,
)
#
self.__dApi = dictionaryApi
self.__attributeTypeD = {}
self.__castD = {"integer": int, "float": float, "string": str}
self.__applyMolStarTypes = kwargs.get("applyMolStarTypes", True)
self.__dch = DataCategoryHints()
self.__typesSet = self.applyTypes(
ignoreCastErrors=ignoreCastErrors,
useCifUnknowns=useCifUnknowns,
missingValueString=missingValueString,
missingValueInteger=missingValueInteger,
missingValueFloat=missingValueFloat,
)
def applyTypes(self, ignoreCastErrors=False, useCifUnknowns=True, missingValueString=None, missingValueInteger=None, missingValueFloat=None):
"""Cast data types (string, integer, float) in the current object based on dictionary type details.
Missing values ('.' or '?') are set to None.
Raises:
e: any exception
Returns:
bool: True for success or False otherwise
"""
ok = False
try:
for ii, atName in enumerate(self.getAttributeList()):
# colValL = self.getColumn(ii)
dataType, isMandatory = self.__getAttributeInfo(atName)
if not dataType:
if not ignoreCastErrors:
logger.warning("Undefined type for category %s attribute %s - Will treat as string", self.getName(), atName)
dataType = "string" # Treat undefined attributes as strings
missingValue = missingValueInteger if dataType == "integer" else missingValueFloat if dataType in ["integer", "float"] else missingValueString
missingValue = missingValue if not useCifUnknowns else "." if isMandatory else "?"
for row in self.data:
try:
row[ii] = self.__castD[dataType](row[ii]) if row[ii] is not None and row[ii] not in [".", "?"] else missingValue
except Exception as e:
if not ignoreCastErrors:
logger.error("Cast error %s %s (%s) %r %r", self.getName(), atName, dataType, row[ii], str(e))
row[ii] = missingValue
#
logger.debug("%s %s %r", self.getName(), atName, [row[ii] for row in self.data])
self.__attributeTypeD[atName] = dataType
ok = True
except Exception as e:
logger.exception("Failing with %s", str(e))
if self._raiseExceptions:
raise e
return ok
def getAttributeInfo(self, atName):
"""Get attribute data type (string, integer, or float) and optionality
Args:
atName (str): attribute name
Returns:
(string, bool): data type (string, integer or float) and mandatory code
"""
try:
dataType, mandatoryCode = self.__getAttributeInfo(atName)
return dataType, mandatoryCode
except Exception:
return None, None
def applyStringTypes(self):
"""Cast data types to strings in the current object. Missing values are set to '?' and '.' for
optional and mandatory attributes, respectively.
Raises:
e: any exception
Returns:
bool: True for success or False otherwise
"""
ok = False
try:
for ii, atName in enumerate(self.getAttributeList()):
_, isMandatory = self.__getAttributeInfo(atName)
dataType = "string"
for row in self.data:
if row[ii] is None or row[ii] in [".", "?"]:
row[ii] = "." if isMandatory else "?"
else:
row[ii] = self.__castD[dataType](row[ii])
#
self.__attributeTypeD[atName] = dataType
ok = True
except Exception as e:
logger.exception("Failing with %s", str(e))
if self._raiseExceptions:
raise e
return ok
def cmpAttributeValues(self, dcObj, ignoreOrder=True, **kwargs):
"""Compare the values by attribute for current typed data category (dca) and input data category.
The comparison is performed for values of the attributes common to both objects. Length differences
are treated inequality out of hand.
Args:
dcObj (object): DataCategory object
ignoreOrder (bool, optional): ignore attribute order. Defaults to True.
floatRelTolerance (float, optional): relative tolerance for float comparisons. Defaults to 1e-05.
floatAbsTolerance (float, optional): absolute tolerance for float comparisons. Defaults to 1e-04.
Raises:
e: any exception
Returns:
list: [(attributeName, values equal/close flag (bool)), (attributeName, values equal/close flag (bool), ...]
"""
rL = []
floatRelTolerance = kwargs.get("floatRelTolerance", 1.0e-05)
floatAbsTolerance = kwargs.get("floatAbsTolerance", 1.0e-04)
try:
sa = set(self.getAttributeList())
sb = set(dcObj.getAttributeList())
atNameComList = list(sa & sb)
#
lenEq = self.getRowCount() == dcObj.getRowCount()
if not lenEq:
return [(atName, False) for atName in atNameComList]
#
for atName in atNameComList:
dataType, _ = self.__getAttributeInfo(atName)
if dataType in ["string", "integer"]:
if ignoreOrder:
same = sorted(self.getAttributeValueList(atName)) == sorted(dcObj.getAttributeValueList(atName))
else:
same = self.getAttributeValueList(atName) == dcObj.getAttributeValueList(atName)
elif dataType in ["float"]:
aVL = self.getAttributeValueList(atName)
bVL = dcObj.getAttributeValueList(atName)
if ignoreOrder:
for aV, bV in zip(sorted(aVL), sorted(bVL)):
same = self.__isClose(aV, bV, relTol=floatRelTolerance, absTol=floatAbsTolerance)
if not same:
break
else:
for aV, bV in zip(aVL, bVL):
same = self.__isClose(aV, bV, relTol=floatRelTolerance, absTol=floatAbsTolerance)
if not same:
logger.info("%s %s (rel=%r) (abs=%r) %r (%r)", self.getName(), atName, aV * floatRelTolerance, floatAbsTolerance, aV, abs(aV - bV))
break
rL.append((atName, same))
#
return rL
except Exception as e:
if self._raiseExceptions:
raise e
return rL
def __getAttributeInfo(self, atName):
"""Get attribute data type (string, integer, or float) and optionality
Args:
atName (str): attribute name
Returns:
(string, bool): data type (string, integer or float) and mandatory code
"""
logger.debug("Working on cat %r, atName %r", self.getName(), atName)
cifDataType = self.__dApi.getTypeCode(self.getName(), atName)
# cifPrimitiveType = self.__dApi.getTypePrimitive(self.getName(), atName)
isMandatory = self.__dApi.getMandatoryCode(self.getName(), atName) in ["yes", "implicit", "implicit-ordinal"]
if cifDataType is None:
dataType = None
else:
dataType = self.__dch.getPdbxItemType(cifDataType)
# dataType = "integer" if "int" in cifDataType else "float" if cifPrimitiveType == "numb" else "string"
# Allow for forced Mol* integer types
if self.__applyMolStarTypes:
nm = CifName().itemName(self.getName(), atName)
if self.__dch.inMolStarIntHints(nm):
dataType = "integer"
return dataType, isMandatory
def __isClose(self, aV, bV, relTol=1e-09, absTol=1e-06):
if aV is None and bV is None:
return True
elif aV is not None and bV is not None and aV == bV:
return True
elif isinstance(aV, (float)) and isinstance(bV, (float)):
return abs(aV - bV) <= max(relTol * max(abs(aV), abs(bV)), absTol)
else:
raise ValueError
__init__(self, dataCategoryObj, dictionaryApi=None, raiseExceptions=True, copyInputData=True, ignoreCastErrors=False, useCifUnknowns=True, missingValueString=None, missingValueInteger=None, missingValueFloat=None, **kwargs)
special
A subclass of DataCategory with methods to apply explicit data typing.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataCategoryObj |
object |
DataCategory object instance |
required |
dictionaryApi |
object |
instance of DictionaryApi class. Defaults to None. |
None |
raiseExceptions |
bool |
raise exceptions. Defaults to True. |
True |
copyInputData |
bool |
make a new copy input data. Defaults to True. |
True |
ignoreCastErrors |
bool |
ignore data processing cast errors. Defaults to False. |
False |
useCifUnknowns |
bool |
use CIF style missing values ('.' and '?'). Defaults to True. |
True |
missingValueString |
str |
missing string value . Defaults to None. |
None |
missingValueInteger |
integer |
missing integer value. Defaults to None. |
None |
missingValueFloat |
float |
missing float value. Defaults to None. |
None |
applyMolStarTypes |
bool |
use Mol* forced integer types. Defaults to True. |
required |
Source code in mmcif/api/DataCategoryTyped.py
def __init__(
self,
dataCategoryObj,
dictionaryApi=None,
raiseExceptions=True,
copyInputData=True,
ignoreCastErrors=False,
useCifUnknowns=True,
missingValueString=None,
missingValueInteger=None,
missingValueFloat=None,
**kwargs
):
"""A subclass of DataCategory with methods to apply explicit data typing.
Args:
dataCategoryObj (object): DataCategory object instance
dictionaryApi (object, optional): instance of DictionaryApi class. Defaults to None.
raiseExceptions (bool, optional): raise exceptions. Defaults to True.
copyInputData (bool, optional): make a new copy input data. Defaults to True.
ignoreCastErrors (bool, optional): ignore data processing cast errors. Defaults to False.
useCifUnknowns (bool, optional): use CIF style missing values ('.' and '?'). Defaults to True.
missingValueString (str, optional): missing string value . Defaults to None.
missingValueInteger (integer, optional): missing integer value. Defaults to None.
missingValueFloat (float, optional): missing float value. Defaults to None.
applyMolStarTypes (bool, optional): use Mol* forced integer types. Defaults to True.
"""
self.__dcObj = dataCategoryObj
super(DataCategoryTyped, self).__init__(
self.__dcObj.getName(),
self.__dcObj.getAttributeList(),
self.__dcObj.data,
raiseExceptions=raiseExceptions,
copyInputData=copyInputData,
)
#
self.__dApi = dictionaryApi
self.__attributeTypeD = {}
self.__castD = {"integer": int, "float": float, "string": str}
self.__applyMolStarTypes = kwargs.get("applyMolStarTypes", True)
self.__dch = DataCategoryHints()
self.__typesSet = self.applyTypes(
ignoreCastErrors=ignoreCastErrors,
useCifUnknowns=useCifUnknowns,
missingValueString=missingValueString,
missingValueInteger=missingValueInteger,
missingValueFloat=missingValueFloat,
)
applyStringTypes(self)
Cast data types to strings in the current object. Missing values are set to '?' and '.' for optional and mandatory attributes, respectively.
Exceptions:
Type | Description |
---|---|
e |
any exception |
Returns:
Type | Description |
---|---|
bool |
True for success or False otherwise |
Source code in mmcif/api/DataCategoryTyped.py
def applyStringTypes(self):
"""Cast data types to strings in the current object. Missing values are set to '?' and '.' for
optional and mandatory attributes, respectively.
Raises:
e: any exception
Returns:
bool: True for success or False otherwise
"""
ok = False
try:
for ii, atName in enumerate(self.getAttributeList()):
_, isMandatory = self.__getAttributeInfo(atName)
dataType = "string"
for row in self.data:
if row[ii] is None or row[ii] in [".", "?"]:
row[ii] = "." if isMandatory else "?"
else:
row[ii] = self.__castD[dataType](row[ii])
#
self.__attributeTypeD[atName] = dataType
ok = True
except Exception as e:
logger.exception("Failing with %s", str(e))
if self._raiseExceptions:
raise e
return ok
applyTypes(self, ignoreCastErrors=False, useCifUnknowns=True, missingValueString=None, missingValueInteger=None, missingValueFloat=None)
Cast data types (string, integer, float) in the current object based on dictionary type details. Missing values ('.' or '?') are set to None.
Exceptions:
Type | Description |
---|---|
e |
any exception |
Returns:
Type | Description |
---|---|
bool |
True for success or False otherwise |
Source code in mmcif/api/DataCategoryTyped.py
def applyTypes(self, ignoreCastErrors=False, useCifUnknowns=True, missingValueString=None, missingValueInteger=None, missingValueFloat=None):
"""Cast data types (string, integer, float) in the current object based on dictionary type details.
Missing values ('.' or '?') are set to None.
Raises:
e: any exception
Returns:
bool: True for success or False otherwise
"""
ok = False
try:
for ii, atName in enumerate(self.getAttributeList()):
# colValL = self.getColumn(ii)
dataType, isMandatory = self.__getAttributeInfo(atName)
if not dataType:
if not ignoreCastErrors:
logger.warning("Undefined type for category %s attribute %s - Will treat as string", self.getName(), atName)
dataType = "string" # Treat undefined attributes as strings
missingValue = missingValueInteger if dataType == "integer" else missingValueFloat if dataType in ["integer", "float"] else missingValueString
missingValue = missingValue if not useCifUnknowns else "." if isMandatory else "?"
for row in self.data:
try:
row[ii] = self.__castD[dataType](row[ii]) if row[ii] is not None and row[ii] not in [".", "?"] else missingValue
except Exception as e:
if not ignoreCastErrors:
logger.error("Cast error %s %s (%s) %r %r", self.getName(), atName, dataType, row[ii], str(e))
row[ii] = missingValue
#
logger.debug("%s %s %r", self.getName(), atName, [row[ii] for row in self.data])
self.__attributeTypeD[atName] = dataType
ok = True
except Exception as e:
logger.exception("Failing with %s", str(e))
if self._raiseExceptions:
raise e
return ok
cmpAttributeValues(self, dcObj, ignoreOrder=True, **kwargs)
Compare the values by attribute for current typed data category (dca) and input data category. The comparison is performed for values of the attributes common to both objects. Length differences are treated inequality out of hand.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dcObj |
object |
DataCategory object |
required |
ignoreOrder |
bool |
ignore attribute order. Defaults to True. |
True |
floatRelTolerance |
float |
relative tolerance for float comparisons. Defaults to 1e-05. |
required |
floatAbsTolerance |
float |
absolute tolerance for float comparisons. Defaults to 1e-04. |
required |
Exceptions:
Type | Description |
---|---|
e |
any exception |
Returns:
Type | Description |
---|---|
list |
[(attributeName, values equal/close flag (bool)), (attributeName, values equal/close flag (bool), ...] |
Source code in mmcif/api/DataCategoryTyped.py
def cmpAttributeValues(self, dcObj, ignoreOrder=True, **kwargs):
"""Compare the values by attribute for current typed data category (dca) and input data category.
The comparison is performed for values of the attributes common to both objects. Length differences
are treated inequality out of hand.
Args:
dcObj (object): DataCategory object
ignoreOrder (bool, optional): ignore attribute order. Defaults to True.
floatRelTolerance (float, optional): relative tolerance for float comparisons. Defaults to 1e-05.
floatAbsTolerance (float, optional): absolute tolerance for float comparisons. Defaults to 1e-04.
Raises:
e: any exception
Returns:
list: [(attributeName, values equal/close flag (bool)), (attributeName, values equal/close flag (bool), ...]
"""
rL = []
floatRelTolerance = kwargs.get("floatRelTolerance", 1.0e-05)
floatAbsTolerance = kwargs.get("floatAbsTolerance", 1.0e-04)
try:
sa = set(self.getAttributeList())
sb = set(dcObj.getAttributeList())
atNameComList = list(sa & sb)
#
lenEq = self.getRowCount() == dcObj.getRowCount()
if not lenEq:
return [(atName, False) for atName in atNameComList]
#
for atName in atNameComList:
dataType, _ = self.__getAttributeInfo(atName)
if dataType in ["string", "integer"]:
if ignoreOrder:
same = sorted(self.getAttributeValueList(atName)) == sorted(dcObj.getAttributeValueList(atName))
else:
same = self.getAttributeValueList(atName) == dcObj.getAttributeValueList(atName)
elif dataType in ["float"]:
aVL = self.getAttributeValueList(atName)
bVL = dcObj.getAttributeValueList(atName)
if ignoreOrder:
for aV, bV in zip(sorted(aVL), sorted(bVL)):
same = self.__isClose(aV, bV, relTol=floatRelTolerance, absTol=floatAbsTolerance)
if not same:
break
else:
for aV, bV in zip(aVL, bVL):
same = self.__isClose(aV, bV, relTol=floatRelTolerance, absTol=floatAbsTolerance)
if not same:
logger.info("%s %s (rel=%r) (abs=%r) %r (%r)", self.getName(), atName, aV * floatRelTolerance, floatAbsTolerance, aV, abs(aV - bV))
break
rL.append((atName, same))
#
return rL
except Exception as e:
if self._raiseExceptions:
raise e
return rL
getAttributeInfo(self, atName)
Get attribute data type (string, integer, or float) and optionality
Parameters:
Name | Type | Description | Default |
---|---|---|---|
atName |
str |
attribute name |
required |
Returns:
Type | Description |
---|---|
(string, bool) |
data type (string, integer or float) and mandatory code |
Source code in mmcif/api/DataCategoryTyped.py
def getAttributeInfo(self, atName):
"""Get attribute data type (string, integer, or float) and optionality
Args:
atName (str): attribute name
Returns:
(string, bool): data type (string, integer or float) and mandatory code
"""
try:
dataType, mandatoryCode = self.__getAttributeInfo(atName)
return dataType, mandatoryCode
except Exception:
return None, None