Skip to content

DataCategoryFormatted

A subclass of DataCategory including additional formatting methods.

Source code in mmcif/api/DataCategoryFormatted.py
class DataCategoryFormatted(DataCategory):
    """A subclass of DataCategory including additional formatting methods.

    The extra methods render stored values as PDBx text tokens (unquoted,
    single/double quoted, or semicolon-delimited text fields) and report the
    per-attribute data types, format types and raw value widths.
    """

    def __init__(self, dataCategoryObj, preferDoubleQuotes=True):
        """Build a formatting view from an existing category object.

        Args:
            dataCategoryObj: object providing ``getName()``, ``getAttributeList()``
                and a ``data`` attribute; these three values are handed to the
                ``DataCategory`` constructor.
            preferDoubleQuotes (bool): when True, double quotes are tried before
                single quotes for values that must be quoted; when False the
                order is reversed.
        """
        self.__dcObj = dataCategoryObj
        super(DataCategoryFormatted, self).__init__(self.__dcObj.getName(), self.__dcObj.getAttributeList(), self.__dcObj.data)
        #
        self._currentRowIndex = 0
        self._currentAttribute = None
        #
        # Embedded-quote avoidance is not exposed as an option; it is fixed to False here.
        self.__avoidEmbeddedQuoting = False
        self.__preferDoubleQuotes = preferDoubleQuotes
        #
        # --------------------------------------------------------------------
        # any whitespace
        self.__wsRe = re.compile(r"\s")
        # any whitespace, quote character or comment character
        self.__wsAndQuotesRe = re.compile(r"[\s'\"#]")
        # any newline or carriage control
        self.__nlRe = re.compile(r"[\n\r]")
        #
        # single quote
        self.__sqRe = re.compile(r"[']")
        # single quote adjacent to whitespace
        self.__sqWsRe = re.compile(r"('\s)|(\s')")

        # double quote
        self.__dqRe = re.compile(r'["]')
        # double quote adjacent to whitespace
        self.__dqWsRe = re.compile(r'("\s)|(\s")')
        #
        # Numeric patterns are anchored with \Z rather than $ because $ also
        # matches just before a trailing newline, which would let a value such
        # as "12\n" be emitted as an unquoted number with an embedded newline.
        self.__intRe = re.compile(r"^[0-9]+\Z")
        self.__floatRe = re.compile(r"^-?(([0-9]+)[.]?|([0-9]*[.][0-9]+))([(][0-9]+[)])?([eE][+-]?[0-9]+)?\Z")
        #
        # Data types in increasing precedence; the position in this list is the
        # rank used by getFormatTypeList() to pick one type per attribute.
        self.__dataTypeList = [
            "DT_NULL_VALUE",
            "DT_INTEGER",
            "DT_FLOAT",
            "DT_UNQUOTED_STRING",
            "DT_ITEM_NAME",
            "DT_DOUBLE_QUOTED_STRING",
            "DT_SINGLE_QUOTED_STRING",
            "DT_MULTI_LINE_STRING",
        ]
        # Format type for the data type at the same position in __dataTypeList.
        self.__formatTypeList = [
            "FT_NULL_VALUE",
            "FT_NUMBER",
            "FT_NUMBER",
            "FT_UNQUOTED_STRING",
            "FT_QUOTED_STRING",
            "FT_QUOTED_STRING",
            "FT_QUOTED_STRING",
            "FT_MULTI_LINE_STRING",
        ]

    def __formatPdbx(self, inp):
        """Format input data following PDBx quoting rules -

        The data type is obtained from __dataTypePdbx() so that the reported
        type and the quoting applied here always agree.

        Args:
            inp: value to format (None, int, float or string).

        Returns:
            tuple: (list of string fragments to be joined, data type name).
                Any failure is logged and reported as (["?"], "DT_NULL_VALUE").
        """
        try:
            dataType = self.__dataTypePdbx(inp)

            if dataType == "DT_NULL_VALUE":
                if inp is None:
                    return (["?"], dataType)
                if inp == "":
                    # an empty string is written as the "." null token
                    return (["."], dataType)
                # "." and "?" are passed through unchanged
                return ([inp], dataType)

            if dataType in ("DT_INTEGER", "DT_FLOAT", "DT_UNQUOTED_STRING"):
                return ([str(inp)], dataType)

            if dataType in ("DT_ITEM_NAME", "DT_DOUBLE_QUOTED_STRING"):
                return (self.__doubleQuotedList(inp), dataType)

            if dataType == "DT_SINGLE_QUOTED_STRING":
                return (self.__singleQuotedList(inp), dataType)

            return (self.__semiColonQuotedList(inp), dataType)
        except Exception as e:
            logger.exception("Failing with %s on input %r %r", str(e), inp, type(inp))

        return (["?"], "DT_NULL_VALUE")

    def __dataTypePdbx(self, inp):
        """Detect the PDBx data type -

        Args:
            inp: value to classify (None, int, float or string).

        Returns:
            str: one of the names held in the internal data type list.

        Raises:
            TypeError: from the regular expression search when inp is not
                None, int, float or a string and does not compare equal to
                ".", "?" or "".
        """
        if inp is None:
            return "DT_NULL_VALUE"

        isString = isinstance(inp, string_types)

        # pure numerical values are returned as unquoted strings
        if isinstance(inp, int) or (isString and self.__intRe.search(inp)):
            return "DT_INTEGER"

        if isinstance(inp, float) or (isString and self.__floatRe.search(inp)):
            return "DT_FLOAT"

        # null value handling -
        if inp == "." or inp == "?" or inp == "":
            return "DT_NULL_VALUE"

        # Contains white space or quotes ?
        if not self.__wsAndQuotesRe.search(inp):
            if inp[0] == "_":
                return "DT_ITEM_NAME"
            # leading characters and reserved words that are double quoted on output
            if inp[0] in ("[", "]", "$", "#", ";"):
                return "DT_DOUBLE_QUOTED_STRING"
            if inp[:5].lower() in ("data_", "loop_", "save_", "stop_") or inp[:7].lower() == "global_":
                return "DT_DOUBLE_QUOTED_STRING"
            return "DT_UNQUOTED_STRING"

        if self.__nlRe.search(inp):
            return "DT_MULTI_LINE_STRING"

        return self.__quotedTypePdbx(inp)

    def __quotedTypePdbx(self, inp):
        """Select the quoting type for a single-line string that needs quoting.

        A quote style is usable when the string does not contain that quote
        character. When embedded-quote avoidance is enabled the additional
        whitespace-adjacent quote tests of the original selection logic apply.

        Args:
            inp (str): string containing whitespace, quote or '#' characters
                and no newline.

        Returns:
            str: "DT_DOUBLE_QUOTED_STRING", "DT_SINGLE_QUOTED_STRING", or
                "DT_MULTI_LINE_STRING" when neither quote style is usable.
        """
        avoid = self.__avoidEmbeddedQuoting
        doubleOk = not self.__dqRe.search(inp) and not (avoid and self.__sqWsRe.search(inp))
        singleOk = not self.__sqRe.search(inp) and not (avoid and self.__dqWsRe.search(inp))

        if self.__preferDoubleQuotes:
            candidates = ((doubleOk, "DT_DOUBLE_QUOTED_STRING"), (singleOk, "DT_SINGLE_QUOTED_STRING"))
        else:
            candidates = ((singleOk, "DT_SINGLE_QUOTED_STRING"), (doubleOk, "DT_DOUBLE_QUOTED_STRING"))

        for usable, dataType in candidates:
            if usable:
                return dataType
        return "DT_MULTI_LINE_STRING"

    def __singleQuotedList(self, inp):
        """Return the fragments of inp wrapped in single quotes."""
        return ["'", inp, "'"]

    def __doubleQuotedList(self, inp):
        """Return the fragments of inp wrapped in double quotes."""
        return ['"', inp, '"']

    def __semiColonQuotedList(self, inp):
        """Return the fragments of inp written as a semicolon-delimited text field.

        The field opens on a new line; a newline is inserted before the closing
        semicolon only when inp does not already end with one.
        """
        if inp[-1] == "\n":
            return ["\n", ";", inp, ";", "\n"]
        return ["\n", ";", inp, "\n", ";", "\n"]

    def __rowForLogging(self, rowIndex):
        """Return self.data[rowIndex] for diagnostics, or None if it cannot be read.

        Used inside error paths so that building the log message can never
        raise and mask the error being reported.
        """
        try:
            return self.data[rowIndex]
        except Exception:
            return None

    def getValueFormatted(self, attributeName=None, rowIndex=None):
        """Return the formatted value of an attribute in a row.

        Args:
            attributeName (str, optional): attribute name; defaults to the
                current attribute of this object.
            rowIndex (int, optional): row index; defaults to the current row
                index of this object.

        Returns:
            str: the formatted value, or None when a failure other than an
                IndexError occurs (for instance an attribute name that is not
                in the attribute list); such failures are logged.

        Raises:
            IndexError: when the row or the attribute position is out of range.
            TypeError: when the attribute is not a string or the row index is
                not an int.
        """
        attribute = self._currentAttribute if attributeName is None else attributeName
        rowI = self._currentRowIndex if rowIndex is None else rowIndex

        if not (isinstance(attribute, self._stringTypes) and isinstance(rowI, int)):
            logger.error(" Type error - AttributeName %r rowI %r rowdata %r", attributeName, rowI, self.__rowForLogging(rowI))
            logger.error(" Type error - string types %r", self._stringTypes)
            raise TypeError(attribute)

        try:
            fList, _ = self.__formatPdbx(self.data[rowI][self._attributeNameList.index(attribute)])
            return "".join(fList)
        except IndexError:
            logger.exception("attributeName %s rowI %r rowdata %r", attributeName, rowI, self.__rowForLogging(rowI))
            # bare raise keeps the original message and traceback
            raise
        except Exception as e:
            logger.exception(" Failing with %s - AttributeName %s rowI %r rowdata %r", str(e), attributeName, rowI, self.__rowForLogging(rowI))
        return None

    def getValueFormattedByIndex(self, attributeIndex, rowIndex):
        """Return the formatted value at the given attribute and row positions.

        Args:
            attributeIndex (int): position of the attribute within a row.
            rowIndex (int): row index.

        Returns:
            str: the formatted value.

        Raises:
            IndexError: when either index is out of range.
            Exception: any other failure is logged and re-raised.
        """
        try:
            fList, _ = self.__formatPdbx(self.data[rowIndex][attributeIndex])
            return "".join(fList)
        except IndexError:
            logger.exception("attributeIndex %r rowIndex %r rowdata %r", attributeIndex, rowIndex, self.__rowForLogging(rowIndex))
            raise
        except Exception as e:
            logger.exception("Failing with %s  - attributeIndex %r rowIndex %r rowdata %r", str(e), attributeIndex, rowIndex, self.__rowForLogging(rowIndex))
            raise

    def getAttributeValueMaxLengthList(self, steps=1):
        """Return the maximum raw value length for each attribute.

        Lengths are measured on the stored values (non-string values are
        converted with str()), not on the formatted/quoted output.

        Args:
            steps (int): stride used to sample rows (every steps-th row).

        Returns:
            list: one maximum length per attribute, in attribute order.
        """
        numAttributes = len(self._attributeNameList)
        mList = [0] * numAttributes
        for row in self.data[::steps]:
            for indx in range(numAttributes):
                val = row[indx]
                tLen = len(val) if isinstance(val, self._stringTypes) else len(str(val))
                mList[indx] = max(mList[indx], tLen)
        return mList

    def getFormatTypeList(self, steps=1):
        """Return the format type and data type chosen for each attribute.

        For every attribute the highest ranking data type found in the sampled
        rows is kept (rank is the position in the internal data type list) and
        then mapped to its format type.

        Args:
            steps (int): stride used to sample rows (every steps-th row).

        Returns:
            tuple: (format type list, data type list). On failure the error is
                logged, the format type list is empty and the data type list
                holds whatever had been determined up to that point.
        """
        curFormatTypeList = []
        curDataTypeList = []
        try:
            curDataTypeList = ["DT_NULL_VALUE"] * len(self._attributeNameList)
            rankOf = {dataType: ii for ii, dataType in enumerate(self.__dataTypeList)}
            for row in self.data[::steps]:
                for indx in range(len(curDataTypeList)):
                    dType = self.__dataTypePdbx(row[indx])
                    if rankOf[dType] > rankOf[curDataTypeList[indx]]:
                        curDataTypeList[indx] = dType

            # Map the format types to the data types
            formatTypeOf = dict(zip(self.__dataTypeList, self.__formatTypeList))
            curFormatTypeList = [formatTypeOf[dt] for dt in curDataTypeList]
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return curFormatTypeList, curDataTypeList