Source code for pidibble.mmcif_parse

# Author: Cameron F. Abrams <cfa22@drexel.edu>
"""
.. module:: mmcif_parse

   :synopsis: defines the MMCIF_Parser class for parsing mmCIF files

   .. moduleauthor:: Cameron F. Abrams <cfa22@drexel.edu>
   
"""

from collections import UserDict
from .pdbrecord import PDBRecord, PDBRecordDict, PDBRecordList
from .baserecord import BaseRecord
import logging
logger=logging.getLogger(__name__)

def split_ri(ri):
    """
    Split a residue identifier into its sequence number and insertion code.

    Parameters
    ----------
    ri : str or int or float
        The residue identifier: a string such as ``'1234A'`` or ``'1234'``,
        an integer such as ``1234``, or a float with an integral value such
        as ``-5.0``.

    Returns
    -------
    tuple
        ``(seqNum, iCode)`` where ``seqNum`` is an ``int`` and ``iCode`` is the
        trailing non-digit character of a string identifier, or ``''`` when
        there is none.

    Raises
    ------
    IndexError
        If ``ri`` is an empty string.
    ValueError
        If the numeric part of a string identifier cannot be converted by ``int``.
    TypeError
        If ``ri`` is of any other unsupported type (e.g. a non-integral float).
    """
    # A float with an integral value carries the same information as an int
    # and cannot have an insertion code; normalise it instead of failing on
    # the string indexing below.
    if isinstance(ri, float) and ri.is_integer():
        ri = int(ri)
    if isinstance(ri, int):
        # a bare number has no insertion code
        return ri, ''
    if ri[-1].isdigit():
        # all-numeric string: there is no insertion code
        return int(ri), ''
    # the last character is the insertion code, the rest is the number
    return int(ri[:-1]), ri[-1]
def rectify(val):
    """
    Convert a raw value to an int, float, or string, blanking empty markers.

    Parameters
    ----------
    val : str
        The value to be rectified. Any falsy value, and the single-character
        markers ``'.'`` and ``'?'``, are treated as blank.

    Returns
    -------
    int or float or str
        ``''`` for a blank value; an ``int`` if ``int(val)`` succeeds (this
        includes signed numerals such as ``'-5'``); otherwise a ``float`` if
        ``float(val)`` succeeds; otherwise ``val`` unchanged. A truthy value
        that is not a string is returned unchanged.
    """
    if not val:
        return ''
    if not isinstance(val, str):
        # Only strings are parsed; this avoids e.g. truncating a float via int().
        return val
    # Exact comparison: a substring test against '.?' would also match '.?' itself.
    if val in ('.', '?'):
        return ''
    # Try int before float so that signed integers do not come back as floats.
    try:
        return int(val)
    except ValueError:
        pass
    try:
        return float(val)
    except ValueError:
        return val
def resolve(key, aDict):
    """
    Placeholder for resolving ``key`` against ``aDict``.

    No resolution is implemented: both arguments are ignored and the
    function always returns ``None``.
    """
    return None
class MMCIFDict(UserDict):
    """
    A dictionary-like class for handling mmCIF data with blank handling and
    linked-value resolution in :meth:`get`.

    Parameters
    ----------
    data : dict
        The mapping that backs this MMCIFDict. It is used directly (not copied).
    linkers : dict, optional
        A dictionary consulted by :meth:`get` to redirect a retrieved value to
        another key of this dictionary. Defaults to a new empty dict.
    blankers : list, optional
        Values that :meth:`get` reports as the empty string.
        Defaults to a new list ``[' ', '', '?']``.
    """

    def __init__(self, data, linkers=None, blankers=None):
        self.data = data
        # Build the defaults per instance: default argument objects are created
        # once at definition time and would otherwise be shared (and mutated)
        # across every MMCIFDict.
        self.linkers = {} if linkers is None else linkers
        self.blankers = [' ', '', '?'] if blankers is None else blankers

    def get(self, key):
        """
        Retrieve a value by key, applying blank handling and linker resolution.

        Parameters
        ----------
        key : str
            The key to retrieve from the MMCIFDict.

        Returns
        -------
        object
            ``''`` if the stored value is in ``blankers``. Otherwise, if the
            stored value has a truthy entry in ``linkers`` and that entry is a
            key of this dictionary, the value stored under that linked key.
            Otherwise the stored value itself.

        Raises
        ------
        KeyError
            If ``key`` is not present.
        """
        val = self[key]
        if val in self.blankers:
            return ''
        # Note: the lookup in linkers is done with the retrieved value, not with ``key``.
        key_link = self.linkers.get(val, None)
        if key_link and key_link in self.keys():
            val = self[key_link]
        return val
class MMCIF_Parser:
    """
    A parser that turns mmCIF data categories into PDB-style records.

    Parameters
    ----------
    mmcif_formats : dict
        Maps each record type to the mapping specification handed to
        :meth:`gen_dict` by :meth:`parse`.
    pdb_formats : dict
        A dictionary defining the PDB formats; stored as ``pdb_formats``.
    cif_data : object
        An object containing the CIF data. It must provide ``getObj(name)``;
        a non-``None`` result must support ``len()`` and
        ``getValue(attribute, index)``.
    """

    def __init__(self, mmcif_formats, pdb_formats, cif_data):
        self.formats = mmcif_formats
        self.pdb_formats = pdb_formats
        # Lookup tables and unique-id lists accumulated while records are generated.
        self.global_maps = {}
        self.global_ids = {}
        self.cif_data = cif_data

    def update_maps(self, maps, cifrec, idx):
        """
        Update the global maps with values from the CIF record at a specific index.

        Parameters
        ----------
        maps : dict
            Map name -> dictionary with ``'key'`` and ``'value'`` entries naming
            the CIF attributes that supply each map entry.
        cifrec : object
            The CIF record object the values are read from.
        idx : int
            The row index in the CIF record.

        Notes
        -----
        An existing entry is never overwritten: the first value seen for a key wins.
        """
        for mapname, mapspec in maps.items():
            target = self.global_maps.setdefault(mapname, {})
            k = mapspec['key']
            v = mapspec['value']
            key = rectify(cifrec.getValue(k, idx))
            val = rectify(cifrec.getValue(v, idx))
            if key not in target:
                target[key] = val

    def update_ids(self, idmaps, cifrec, idx):
        """
        Update the global ID lists with values from the CIF record at a specific index.

        Parameters
        ----------
        idmaps : dict
            ID name -> CIF attribute name supplying the ID.
        cifrec : object
            The CIF record object the values are read from.
        idx : int
            The row index in the CIF record.

        Notes
        -----
        Each ID list keeps unique values in first-seen order.
        """
        for idname, idspec in idmaps.items():
            seen = self.global_ids.setdefault(idname, [])
            thisid = rectify(cifrec.getValue(idspec, idx))
            if thisid not in seen:
                seen.append(thisid)

    def _row_to_dict(self, cifrec, idx, attr_map, splits, spawns_on, map_values, indexes):
        """Build the attribute dictionary for row ``idx`` of ``cifrec`` according to ``attr_map``."""
        idict = {}
        for k, v in attr_map.items():
            if isinstance(v, dict):
                # A nested spec becomes a sub-record of its own.
                resdict = {kk: rectify(cifrec.getValue(o, idx)) for kk, o in v.items()}
                if 'resseqnumi' in resdict:
                    resdict['seqNum'], resdict['iCode'] = split_ri(resdict['resseqnumi'])
                val = PDBRecord(resdict)
            else:
                val = rectify(cifrec.getValue(v, idx))
            if k == 'resseqnumi':
                # Stored only as its two components.
                idict['seqNum'], idict['iCode'] = split_ri(val)
                continue
            # Only strings can hold a comma-separated list; rectify() may have
            # produced a number, on which the ',' membership test would raise.
            if (k in splits or k == spawns_on) and isinstance(val, str) and ',' in val:
                val = [rectify(x) for x in val.split(',')]
            if k in map_values:
                mapper = self.global_maps[map_values[k]]
                if isinstance(val, list):
                    logger.debug(f'mapper {mapper}')
                    logger.debug(f'list before mapping {val}')
                    mapped_val = list({str(mapper[x]) for x in val})
                    logger.debug(f'list after mapping {mapped_val}')
                    # Every element is a str, so sorting cannot fail.
                    mapped_val.sort()
                    val = mapped_val
                else:
                    val = mapper[val]
            idict[k] = val
            if k == indexes:
                idict['tmp_label'] = f'{k}{val}'
        return idict

    @staticmethod
    def _find_spawn(spdicts, value):
        """Return the first dictionary in ``spdicts`` whose ``'spawn_idx'`` equals ``value``, or ``None``."""
        for sp in spdicts:
            if sp['spawn_idx'] == value:
                return sp
        return None

    @staticmethod
    def _merge_spawn(record, spawn):
        """Merge ``spawn`` (minus ``'spawn_idx'``) into ``record``, joining the two ``'tmp_label'`` values with a dot."""
        extra = spawn.copy()
        del extra['spawn_idx']
        label = extra.pop('tmp_label', '')
        record.update(extra)
        if 'tmp_label' in record and label != '':
            record['tmp_label'] = f'{record["tmp_label"]}.{label}'

    def _build_tables(self, tables, cifrec):
        """Build ``{table name: [BaseRecord, ...]}`` with one record per row of ``cifrec`` for every table spec."""
        # A missing category contributes no rows instead of failing on len(None).
        nrows = 0 if cifrec is None else len(cifrec)
        tabledict = {}
        for tname, tspec in tables.items():
            tabledict[tname] = []
            row_map = tspec['row_attr_map']
            bisv = tspec.get('blank_if_single_valued', [])
            for i in range(nrows):
                tdict = {}
                for k, v in row_map.items():
                    tdict[k] = rectify(cifrec.getValue(v, i))
                    # Blank the column when fewer than two distinct ids were collected for it.
                    if k in bisv and len(self.global_ids[k]) < 2:
                        tdict[k] = ''
                tabledict[tname].append(BaseRecord(tdict))
        return tabledict

    def gen_dict(self, mapspec):
        """
        Generate a list of dictionaries from one mapping specification.

        Parameters
        ----------
        mapspec : dict
            The mapping specification. Keys read by this method:

            - ``'data_obj'`` (required): name passed to ``cif_data.getObj``.
            - ``'attr_map'``: output key -> CIF attribute name, or -> a dict of
              sub-key -> CIF attribute name, which yields a nested
              :class:`pdbrecord.PDBRecord`. The key ``'resseqnumi'`` is stored
              as ``'seqNum'`` and ``'iCode'``.
            - ``'splits'``: keys whose comma-containing string values are split
              into lists of rectified items.
            - ``'spawns_on'`` / ``'spawn_data'``: key whose value(s) select the
              dictionaries generated from ``'spawn_data'`` by their
              ``'spawn_idx'``; the matching dictionary is merged into the row.
              A list value yields one output dictionary per element.
            - ``'indexes'``: key whose value is used to build ``'tmp_label'``.
            - ``'map_values'``: key -> name of a global map used to translate
              the value (list values become a sorted list of unique strings).
            - ``'global_maps'`` / ``'global_ids'``: specs forwarded to
              :meth:`update_maps` / :meth:`update_ids` for every accepted row.
            - ``'list_attr'``: new key -> list of names; if all names are keys
              of the row dictionary, the list of their values is stored,
              otherwise the list of names itself.
            - ``'signal_attr'`` / ``'signal_value'``: when ``'signal_attr'`` is
              given, only rows whose raw value of that attribute equals
              ``'signal_value'`` are processed.
            - ``'tables'``: table name -> spec with ``'row_attr_map'`` and an
              optional ``'blank_if_single_valued'`` list.
            - ``'allcaps'``: keys whose string values are upper-cased.

            Any other key (for example ``'if_dot_replace_with'``) is not used here.

        Returns
        -------
        list
            One dictionary per generated record. When ``'tables'`` is non-empty,
            or the data object is ``None``, the list holds a single dictionary
            ``{'tables': ...}``.

        Raises
        ------
        Exception
            If no spawn dictionary has a ``'spawn_idx'`` equal to a spawn value.
        """
        attr_map = mapspec.get('attr_map', {})
        splits = mapspec.get('splits', [])
        spawns_on = mapspec.get('spawns_on', None)
        indexes = mapspec.get('indexes', None)
        map_values = mapspec.get('map_values', {})
        tables = mapspec.get('tables', {})
        list_attr = mapspec.get('list_attr', {})
        sigattr = mapspec.get('signal_attr', None)
        sigval = mapspec.get('signal_value', None)
        global_maps = mapspec.get('global_maps', {})
        global_ids = mapspec.get('global_ids', {})
        allcaps = mapspec.get('allcaps', [])
        logger.debug(f'getting cifrec for {mapspec["data_obj"]}')
        cifrec = self.cif_data.getObj(mapspec['data_obj'])

        idicts = []
        if not tables and cifrec is not None:
            # Spawn dictionaries are generated on first use and then reused for
            # every row, rather than being regenerated per row.
            spdicts = None
            for idx in range(len(cifrec)):
                if sigattr is not None and cifrec.getValue(sigattr, idx) != sigval:
                    continue
                if global_maps:
                    self.update_maps(global_maps, cifrec, idx)
                if global_ids:
                    self.update_ids(global_ids, cifrec, idx)
                idict = self._row_to_dict(cifrec, idx, attr_map, splits, spawns_on, map_values, indexes)
                for la, vn in list_attr.items():
                    if all(x in idict for x in vn):
                        idict[la] = [idict[x] for x in vn]
                    else:
                        idict[la] = vn
                if not spawns_on:
                    idicts.append(idict)
                    continue
                if spdicts is None:
                    spdicts = self.gen_dict(mapspec['spawn_data'])
                spawn_value = idict[spawns_on]
                if isinstance(spawn_value, list):
                    # One output dictionary per spawn value.
                    spawned_dicts = []
                    for v in spawn_value:
                        sd = idict.copy()
                        sd[spawns_on] = v
                        sp = self._find_spawn(spdicts, v)
                        if sp is None:
                            raise Exception(f'(list) cannot find spawn index for {spawns_on} = {v}; spdicts: {spdicts}')
                        self._merge_spawn(sd, sp)
                        spawned_dicts.append(sd)
                    idicts.extend(spawned_dicts)
                else:
                    sp = self._find_spawn(spdicts, spawn_value)
                    if sp is None:
                        raise Exception(f'cannot find spawn index for {spawns_on} = {spawn_value}')
                    self._merge_spawn(idict, sp)
                    idicts.append(idict)
        else:
            idicts.append({'tables': self._build_tables(tables, cifrec)})

        if allcaps:
            for idict in idicts:
                for k, v in idict.items():
                    # Only strings can be upper-cased; numbers and lists are left as they are.
                    if k in allcaps and isinstance(v, str):
                        idict[k] = v.upper()
        return idicts

    def parse(self):
        """
        Parse the mmCIF data into a dictionary of :class:`pdbrecord.PDBRecord` instances.

        Every record type in ``self.formats`` is passed through :meth:`gen_dict`.
        Each resulting dictionary is stored under the record type, or under
        ``'<record type>.<tmp_label>'`` when it carries a non-empty
        ``'tmp_label'``; that key is also written into the dictionary as
        ``'key'``.

        Returns
        -------
        PDBRecordDict
            Maps each record key to a single :class:`pdbrecord.PDBRecord`, or to
            a ``PDBRecordList`` of them when the key occurs more than once.
        """
        recdict = PDBRecordDict()
        for rectype, mapspec in self.formats.items():
            for idict in self.gen_dict(mapspec):
                this_key = idict.get('tmp_label', '')
                reckey = rectype if not this_key else f'{rectype}.{this_key}'
                idict['key'] = reckey
                if reckey not in recdict:
                    recdict[reckey] = PDBRecord(idict)
                    continue
                # Second occurrence of a key: promote the single record to a list.
                if type(recdict[reckey]) is not PDBRecordList:
                    recdict[reckey] = PDBRecordList([recdict[reckey]])
                recdict[reckey].append(PDBRecord(idict))
        return recdict