# Source code for pidibble.baseparsers
"""
.. module:: baseparsers
   :synopsis: defines some basic string and list parsing functions

.. moduleauthor:: Cameron F. Abrams, <cfa22@drexel.edu>
"""
import logging
logger = logging.getLogger(__name__)
# [docs]
class ListParser:
    """
    Split a string into a list of trimmed, non-empty items.

    Parameters
    ----------
    d : str or None, optional
        The delimiter separating items (a comma by default). ``None``
        means "split on runs of whitespace".
    """

    def __init__(self, d=','):
        self.d = d

    def parse(self, string):
        """
        Break ``string`` into its items using this parser's delimiter.

        Parameters
        ----------
        string : str
            The text to split.

        Returns
        -------
        list
            The items found, each stripped of surrounding whitespace;
            items that are empty after stripping are omitted.
        """
        delimiter = self.d
        if delimiter is None:
            # Whitespace splitting never yields empty or padded pieces,
            # so no further filtering is required.
            return string.split()
        trimmed = (piece.strip() for piece in string.split(delimiter))
        return [piece for piece in trimmed if piece != '']
# [docs]
def list_parse(obj, d):
    """
    Build a parsing callable bound to a given delimiter.

    Parameters
    ----------
    obj : type
        The parser class to instantiate; it is called as ``obj(d)`` and
        must expose a ``parse`` method (ListParser is the intended class).
    d : str or None
        The delimiter handed to the parser's constructor.

    Returns
    -------
    function
        The bound ``parse`` method of the newly created parser instance.
    """
    parser = obj(d)
    return parser.parse
"""
Define a dictionary of parsers for different list formats
"""
# Each entry maps a list-format name to the bound ``parse`` method of a
# ListParser configured with that format's delimiter (None = whitespace).
ListParsers = {
    name: list_parse(ListParser, delimiter)
    for name, delimiter in (
        ('CList', ','),
        ('SList', ';'),
        ('WList', None),
        ('DList', ':'),
        ('LList', '\n'),
    )
}
# Two-row column ruler logged above a record when reporting errors.
# The first row carries the tens digit above columns 10, 20, ..., 80 and the
# second row the ones digit of every column, so the ruler lines up
# character-for-character with an 80-column record printed beneath it.
_cols = """
         1         2         3         4         5         6         7         8
12345678901234567890123456789012345678901234567890123456789012345678901234567890"""
# [docs]
class StringParser:
    """
    A parser for fixed-width strings, with a customizable field map.

    Parameters
    ----------
    fmtdict : dict
        A dictionary mapping field names to tuples of (type, byte_range).
        ``byte_range`` is indexed as a 1-based, inclusive (first, last)
        pair of column numbers.
    typemap : dict
        A dictionary mapping type names to the callables used to convert a
        field's text into a value.
    allowed : dict, optional
        A dictionary mapping field values to allowed values, for validation.
        When omitted, each parser gets its own empty dictionary.
    """

    def __init__(self, fmtdict, typemap, allowed=None):
        self.typemap = typemap
        self.fields = dict(fmtdict)
        # A ``{}`` default in the signature would be one dict shared by every
        # instance; build a fresh one per parser instead.
        self.allowed = {} if allowed is None else allowed

    def parse(self, record):
        """
        Parse a fixed-width string record into a dictionary of fields.

        Parameters
        ----------
        record : str
            The fixed-width string record to parse.

        Returns
        -------
        dict
            A dictionary of fields parsed from the input record. A field
            whose columns are blank, or whose text cannot be converted, is
            stored as the empty string.

        Raises
        ------
        ValueError
            If the record is still longer than 80 characters after
            surrounding whitespace is stripped.
        AssertionError
            If a field's byte range extends past the padded record, or if a
            parsed value fails the ``allowed`` check.
        """
        if len(record) > 80:
            logger.warning('The following record exceeds 80 bytes in length:')
            self.report_record_error(record)
            logger.warning('Stripping...')
            record = record.strip()
            if len(record) > 80:
                raise ValueError('Record is too long; something wrong with your PDB file?')
        input_dict = {}
        record += ' ' * (80 - len(record))  # pad to the full 80 columns
        for k, (typestring, byte_range) in self.fields.items():
            typ = self.typemap[typestring]
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped under ``python -O``; the exception type is the
            # same one the assert statement produced.
            if byte_range[1] > len(record):
                raise AssertionError(f'{record} {byte_range}')
            # using columns beginning with "1" not "0"
            fieldstring = record[byte_range[0] - 1:byte_range[1]].rstrip()
            try:
                input_dict[k] = '' if fieldstring == '' else typ(fieldstring)
            except (ValueError, TypeError):
                self.report_field_error(record, k)
                input_dict[k] = ''
            if typ is str:
                input_dict[k] = input_dict[k].strip()
            # NOTE(review): the lookup key is the raw field text, not the
            # field name -- confirm this matches how callers build ``allowed``.
            if fieldstring in self.allowed:
                if input_dict[k] not in self.allowed[fieldstring]:
                    raise AssertionError(
                        f'Value {input_dict[k]} is not allowed for field {k}; allowed values are {self.allowed[fieldstring]}'
                    )
        return input_dict

    @staticmethod
    def _highlight(record, byte_range):
        """Return ``record`` with the 1-based, inclusive column range wrapped in ANSI red."""
        start = byte_range[0] - 1  # 0-based index of the first highlighted column
        stop = byte_range[1]       # slice end, i.e. one past the last highlighted column
        return record[:start] + '\033[91m' + record[start:stop] + '\033[0m' + record[stop:]

    def report_record_error(self, record, byte_range=None):
        """
        Report an error in parsing a fixed-width string record.

        The record is logged as a warning beneath a column ruler, followed
        by a ``|`` marking the end of the record.

        Parameters
        ----------
        record : str
            The fixed-width string record that caused the error.
        byte_range : list, optional
            A 1-based, inclusive (first, last) column range to highlight in
            the error message. If empty or omitted, the record is reported
            without highlighting.
        """
        if byte_range:
            # Highlight exactly the columns that parse() slices for the field.
            record = self._highlight(record, byte_range)
        repstr = _cols + '\n' + record + '|'
        logger.warning(repstr)

    def report_field_error(self, record, k):
        """
        Report an error in parsing a specific field from a fixed-width string record.

        Parameters
        ----------
        record : str
            The fixed-width string record that caused the error.
        k : str
            The field name that caused the error.
        """
        byte_range = self.fields[k][1]
        logger.warning(f'Could not parse field {k} from bytes {byte_range}:')
        self.report_record_error(record, byte_range=byte_range)
# [docs]
def safe_float(x):
    """
    Convert a string to a float, returning 0.0 for any not-a-number value.

    Parameters
    ----------
    x : str
        The text to convert.

    Returns
    -------
    float
        The converted value, or 0.0 if the text denotes NaN.

    Raises
    ------
    ValueError
        If ``x`` cannot be interpreted as a float.
    """
    import math

    value = float(x)
    # float() accepts NaN in any letter case, with a sign, and with
    # surrounding whitespace; testing the converted value catches all of
    # those spellings rather than only the exact literal 'nan'.
    return 0.0 if math.isnan(value) else value
# [docs]
def str2int_sig(arg: str) -> int:
    """
    Convert a string to an integer, returning -1 if the string is not numeric.

    Surrounding whitespace is ignored and a single leading ``-`` is
    accepted, so negative integers are returned as such.

    Parameters
    ----------
    arg : str
        The text to convert.

    Returns
    -------
    int
        The integer value of ``arg``, or -1 when ``arg`` is empty or is not
        an optionally negated run of decimal digits.
    """
    stripped = arg.strip()
    digits = stripped[1:] if stripped.startswith('-') else stripped
    # isdecimal() admits only characters int() can convert; isnumeric() also
    # admits superscripts, fractions and similar, which make int() raise.
    # An empty digit run (e.g. '' or '-') is likewise rejected here.
    if not digits.isdecimal():
        return -1
    return int(stripped)