Source code for pidibble.baseparsers
"""
.. module:: baseparsers
:synopsis: defines some basic string and list parsing functions
.. moduleauthor: Cameron F. Abrams, <cfa22@drexel.edu>
"""
import logging
logger=logging.getLogger(__name__)
[docs]
class ListParser:
    """
    A simple parser that splits a string into a list of trimmed, non-empty
    tokens using a configurable delimiter.

    Parameters
    ----------
    d : str or None, optional
        The delimiter to split on (default ``','``). Pass ``None`` to split
        on runs of whitespace instead.
    """
    def __init__(self, d=','):
        self.d = d

    def parse(self, string):
        """
        Parse a string into a list of strings, using the configured delimiter.

        Tokens are stripped of surrounding whitespace and empty tokens are
        discarded. If the delimiter is ``None``, the string is split on
        whitespace.

        Parameters
        ----------
        string : str
            The string to parse.

        Returns
        -------
        list
            A list of strings parsed from the input string.
        """
        if self.d is None:
            # str.split() with no argument never yields empty or padded tokens
            return string.split()
        trimmed = (piece.strip() for piece in string.split(self.d))
        return [token for token in trimmed if token]
[docs]
def list_parse(obj, d):
    """
    Build a parser of class ``obj`` for delimiter ``d`` and hand back its
    bound ``parse`` method.

    Parameters
    ----------
    obj : type
        The class to instantiate (should be ListParser); it is called with
        ``d`` as its only argument.
    d : str or None
        The delimiter to use for parsing. If None, it will split on whitespace.

    Returns
    -------
    function
        The ``parse`` method of the new instance: a callable that takes a
        string and returns a list of parsed strings.
    """
    parser = obj(d)
    return parser.parse
"""
Define a dictionary of parsers for different list formats
"""
# Each key names a list format; each value is the bound ``parse`` method of a
# dedicated ListParser: comma (CList), semicolon (SList), whitespace (WList),
# colon (DList) and newline (LList) separated lists.
ListParsers = {
    name: list_parse(ListParser, delimiter)
    for name, delimiter in (
        ('CList', ','),
        ('SList', ';'),
        ('WList', None),
        ('DList', ':'),
        ('LList', '\n'),
    )
}
_cols="""
1 2 3 4 5 6 7 8
12345678901234567890123456789012345678901234567890123456789012345678901234567890"""
[docs]
class StringParser:
    """
    A parser for fixed-width strings, with a customizable field map.

    Parameters
    ----------
    fmtdict : dict
        A dictionary mapping field names to ``(type_name, byte_range)`` pairs.
        ``byte_range`` is a 1-based, inclusive ``[first, last]`` column pair.
    typemap : dict
        A dictionary mapping type names to the callables used to convert the
        field text into a value (e.g. ``str``, ``int``).
    allowed : dict, optional
        A validation table consulted by :meth:`parse`. When the raw
        (right-stripped) text of a field is a key of this dictionary, the
        parsed value must be a member of the corresponding entry.
    """

    def __init__(self, fmtdict, typemap, allowed=None):
        self.typemap = typemap
        # shallow copy so later changes to the caller's dict do not leak in
        self.fields = dict(fmtdict)
        # a fresh dict per instance instead of a shared mutable default
        self.allowed = {} if allowed is None else allowed

    def parse(self, record):
        """
        Parse a fixed-width string record into a dictionary of fields.

        Records shorter than 80 characters are padded with blanks. Records
        longer than 80 characters are stripped of surrounding whitespace
        (with a logged warning) and rejected if still too long. A field that
        is blank, or whose text cannot be converted, is stored as ``''``;
        conversion failures are logged, not raised.

        Parameters
        ----------
        record : str
            The fixed-width string record to parse.

        Returns
        -------
        dict
            A dictionary of fields parsed from the input record.

        Raises
        ------
        ValueError
            If the record is longer than 80 characters even after stripping.
        AssertionError
            If a field's byte range extends past the end of the record, or a
            parsed value fails the ``allowed`` check.
        """
        if len(record) > 80:
            logger.warning('The following record exceeds 80 bytes in length:')
            self.report_record_error(record)
            logger.warning('Stripping...')
            record = record.strip()
            if len(record) > 80:
                raise ValueError('Record is too long; something wrong with your PDB file?')
        input_dict = {}
        record = record.ljust(80)  # pad
        for k, (typestring, byte_range) in self.fields.items():
            typ = self.typemap[typestring]
            # explicit raise (not ``assert``) so the check survives ``python -O``
            if byte_range[1] > len(record):
                raise AssertionError(f'{record} {byte_range}')
            # using columns beginning with "1" not "0"
            fieldstring = record[byte_range[0] - 1:byte_range[1]].rstrip()
            if fieldstring == '':
                input_dict[k] = ''
            else:
                try:
                    input_dict[k] = typ(fieldstring)
                except Exception:
                    # best effort: log the offending field and keep going, but
                    # let KeyboardInterrupt/SystemExit propagate
                    self.report_field_error(record, k)
                    input_dict[k] = ''
            if typ == str:
                input_dict[k] = input_dict[k].strip()
            # NOTE(review): the lookup key is the field's raw text, not the
            # field name ``k`` -- confirm this matches how ``allowed`` is built.
            if fieldstring in self.allowed:
                if input_dict[k] not in self.allowed[fieldstring]:
                    raise AssertionError(
                        f'Value {input_dict[k]} is not allowed for field {k}; allowed values are {self.allowed[fieldstring]}'
                    )
        return input_dict

    @staticmethod
    def _highlight_range(record, byte_range):
        """Return ``record`` with columns ``byte_range`` (1-based, inclusive) wrapped in red ANSI codes."""
        first, last = byte_range[0], byte_range[1]
        # same slice that parse() reads: record[first-1:last]
        return (
            record[:first - 1]
            + '\033[91m' + record[first - 1:last] + '\033[0m'
            + record[last:]
        )

    def report_record_error(self, record, byte_range=None):
        """
        Log a record underneath the column ruler, optionally highlighting a
        range of columns.

        Parameters
        ----------
        record : str
            The fixed-width string record that caused the error.
        byte_range : list, optional
            A 1-based, inclusive ``[first, last]`` column pair to highlight
            in the logged record. If empty or omitted, the record is logged
            without highlighting.
        """
        if byte_range:
            record = self._highlight_range(record, byte_range)
        # trailing '|' marks the end of the record so trailing blanks are visible
        logger.warning(_cols + '\n' + record + '|')

    def report_field_error(self, record, k):
        """
        Log a failure to parse one field of a fixed-width string record.

        Parameters
        ----------
        record : str
            The fixed-width string record that caused the error.
        k : str
            The field name that caused the error.
        """
        byte_range = self.fields[k][1]
        logger.warning('Could not parse field %s from bytes %s:', k, byte_range)
        self.report_record_error(record, byte_range=byte_range)
[docs]
def safe_float(x):
    """
    Convert a value to a float, mapping a textual NaN to 0.0.

    A string that spells ``nan`` in any letter case, optionally signed and
    optionally surrounded by whitespace, yields ``0.0``. This covers
    left-padded fixed-width fields such as ``'   nan'``, which ``float()``
    would otherwise turn into NaN. Every other value is passed to
    ``float()`` unchanged.

    Parameters
    ----------
    x : str or number
        The value to convert.

    Returns
    -------
    float
        ``0.0`` for a textual NaN, otherwise ``float(x)``.

    Raises
    ------
    ValueError
        If ``x`` is a string that ``float()`` cannot interpret.
    """
    if isinstance(x, str) and x.strip().lower() in ('nan', '+nan', '-nan'):
        return 0.0
    return float(x)
[docs]
def str2int_sig(arg: str):
    """
    Convert a string to an integer, returning -1 if the string is not an
    optionally negative run of decimal digits.

    Surrounding whitespace is ignored. A leading ``'-'`` is honoured; a
    leading ``'+'`` is not and yields -1. Empty or blank strings, a lone
    ``'-'``, and text such as ``'-abc'`` or ``'1.5'`` all yield -1 instead
    of raising.

    Parameters
    ----------
    arg : str
        The text to convert.

    Returns
    -------
    int
        The parsed integer, or -1 when ``arg`` is not an integer literal.
    """
    text = arg.strip()
    digits = text[1:] if text.startswith('-') else text
    # isdecimal() matches exactly the digit characters int() accepts;
    # isnumeric() also admits characters such as superscripts or fractions,
    # which int() rejects
    if not digits.isdecimal():
        return -1
    return int(text)