Source code for cobra.io.mat

"""Provide functions for I/O in MATLAB (.mat) format."""

import logging
import re
from collections import OrderedDict
from pathlib import Path
from typing import IO, Dict, Iterable, List, Optional, Pattern, Union

import numpy as np

from ..core import Gene, Group, Metabolite, Model, Object, Reaction
from ..util import create_stoichiometric_matrix
from ..util.solver import set_objective


try:
    import scipy.io as scipy_io
    import scipy.sparse as scipy_sparse
except ImportError:
[docs] scipy_sparse = None
scipy_io = None
[docs]logger = logging.getLogger(__name__)
# The following dictionaries are based on # https://github.com/opencobra/cobratoolbox/blob/docs/source/notes/COBRAModelFields.md # at commit 83d26938a9babff79289d40e20f5f50dd5b710fa # Which is the most updated master as of April 4th, 2022
[docs]MET_MATLAB_TO_PROVIDERS = { "metHMDBID": "hmdb", "metInChIString": "inchi", "metKEGGID": "kegg.compound", "metKEGGGlycanID": "kegg.glycan", "metKEGGDrugID": "kegg.drug", "metUniPathway": "unipathway.compound", "metPubChemID": "pubchem.compound", "metPubChemSubstance": "pubchem.substance", "metCHEBIID": "chebi", "metMetaNetXID": "metanetx.chemical", "metSEEDID": "seed.compound", "metBiGGID": "bigg.metabolite", "metBioCycID": "biocyc", "metEnviPathID": "envipath", "metLIPIDMAPSID": "lipidmaps", "metReactomeID": "reactome", "metSABIORKID": "sabiork.compound", "metSLMID": "slm", "metSMILES": "SMILES", "metSBOTerms": "sbo", "metCasNumber": "cas", }
[docs]MET_PROVIDERS_TO_MATLAB = { MET_MATLAB_TO_PROVIDERS[k]: k for k in MET_MATLAB_TO_PROVIDERS.keys() }
[docs]MET_NOTES_TO_MATLAB = { "metNotes": "metNotes", }
[docs]MET_MATLAB_TO_NOTES = {MET_NOTES_TO_MATLAB[k]: k for k in MET_NOTES_TO_MATLAB.keys()}
[docs]RXN_MATLAB_TO_PROVIDERS = { "rxnECNumbers": "ec-code", "rxnReferences": "pubmed", "rxnKEGGID": "kegg.reaction", "rxnKEGGPathways": "kegg.pathway", "rxnMetaNetXID": "metanetx.reaction", "rxnSEEDID": "seed.reaction", "rxnBiGGID": "bigg.reaction", "rxnBioCycID": "biocyc", "rxnRheaID": "rhea", "rxnReactomeID": "reactome", "rxnSABIORKID": "sabiork.reaction", "rxnBRENDAID": "brenda", "rxnSBOTerms": "sbo", }
[docs]RXN_PROVIDERS_TO_MATLAB = { RXN_MATLAB_TO_PROVIDERS[k]: k for k in RXN_MATLAB_TO_PROVIDERS.keys() }
[docs]CONFIDENCE_STR = "Confidence Level"
[docs]RXN_MATLAB_TO_NOTES = { "rxnReferences": "References", "rxnNotes": "NOTES", "rxnConfidenceScores": CONFIDENCE_STR, }
[docs]RXN_NOTES_TO_MATLAB = { CONFIDENCE_STR: "rxnConfidenceScores", "NOTES": "rxnNotes", "References": "rxnNotes", }
[docs]GENE_MATLAB_TO_PROVIDERS = { "geneEntrezID": "ncbigene", "geneRefSeqID": "refseq", "geneUniprotID": "uniprot", "geneEcoGeneID": "ecogene", "geneKEGGID": "kegg.gene", "geneHPRDID": "hprd", "geneASAPID": "asap", "geneCCDSID": "ccds", "geneNCBIProteinID": "ncbiprotein", }
[docs]GENE_PROVIDERS_TO_MATLAB = { GENE_MATLAB_TO_PROVIDERS[k]: k for k in GENE_MATLAB_TO_PROVIDERS.keys() }
[docs]DICT_GENE = "DICT_GENE"
[docs]DICT_GENE_REV = "DICT_GENE_REV"
[docs]DICT_MET = "DICT_MET"
[docs]DICT_MET_REV = "DICT_MET_REV"
[docs]DICT_MET_NOTES = "DICT_MET_NOTES"
[docs]DICT_MET_NOTES_REV = "DICT_MET_NOTES_REV"
[docs]DICT_REACTION = "DICT_REACTION"
[docs]DICT_REACTION_REV = "DICT_REACTION_REV"
[docs]DICT_REACTION_NOTES = "DICT_REACTION_NOTES"
[docs]DICT_REACTION_NOTES_REV = "DICT_REACTION_NOTES_REV"
[docs]DICT_REPLACE: dict = { DICT_GENE: GENE_MATLAB_TO_PROVIDERS, DICT_GENE_REV: GENE_PROVIDERS_TO_MATLAB, DICT_MET: MET_MATLAB_TO_PROVIDERS, DICT_MET_REV: MET_PROVIDERS_TO_MATLAB, DICT_MET_NOTES: MET_MATLAB_TO_NOTES, DICT_MET_NOTES_REV: MET_NOTES_TO_MATLAB, DICT_REACTION: RXN_MATLAB_TO_PROVIDERS, DICT_REACTION_REV: RXN_PROVIDERS_TO_MATLAB, DICT_REACTION_NOTES: RXN_MATLAB_TO_NOTES, DICT_REACTION_NOTES_REV: RXN_NOTES_TO_MATLAB, }
# precompiled regular expressions (kept globally for caching)
[docs]_bracket_re = re.compile(r"[\[(](?P<compartment>[a-zA-Z]+)[])]$")
# Some older models have _boundary for (some) exchange reactions
[docs]_underscore_re = re.compile(r"_(?P<compartment>[a-zA-Z]+)(_boundary)?$")
[docs]_pubmed_re = re.compile("PMID: ?(\\d+),?")
[docs]_punctuation_re = re.compile(r"^[;,.'\"]+$")
[docs]_double_punctuation_re = re.compile(r"[;,.'\"]{2,}")
[docs]_ec_re = re.compile(r"([\d\-]+.[\d\-]+.[\d\-]+.[\d-]+)")
[docs]_chebi_re = re.compile(r"\D*(\d+),?")
[docs]_sbo_re = re.compile(r"\D*(\d+),?")
[docs]def _get_id_compartment(_id: str) -> str: """Extract the compartment from the `id` string. Parameters ---------- _id : str The ID string to extract component from. Returns ------- str The extracted component string. """ bracket_search = _bracket_re.search(_id) if bracket_search: return bracket_search.group("compartment") underscore_search = _underscore_re.search(_id) if underscore_search: return underscore_search.group("compartment")
[docs]def _cell(x: Iterable[str]) -> np.ndarray: """Translate an iterable `x` into a MATLAB cell array. Parameters ---------- x : iterable of str The data iterable to convert to cell array. Returns ------- numpy.ndarray The converted cell array compatible with MATLAB. """ x_no_none = [i if i is not None else "" for i in x] return np.array(x_no_none, dtype=object)
[docs]def _cell_to_str_list( m_cell: np.ndarray, empty_value: Optional[str] = None, pattern_split: Optional[Pattern] = None, str_prefix: str = "", ) -> List: """Turn an ndarray (cell) to a list of strings. Parameters ---------- m_cell: np.ndarray empty_value: str, optional What value to replace empty cells with. Default None. pattern_split: Pattern, optional Regular expression to use to split the expression. Used for annotations. Default None. str_prefix: str, optional A prefix that will be added to each value in the list if present. Default "". Returns ------- List A list of processed strings. """ if str_prefix and pattern_split: return [ [ str_prefix + str_found if str_prefix not in str_found else str_found for str_found in pattern_split.findall(str(each_cell[0][0])) ] if np.size(each_cell[0]) else empty_value for each_cell in m_cell ] elif pattern_split: return [ pattern_split.findall(str(each_cell[0][0])) if np.size(each_cell[0]) else empty_value for each_cell in m_cell ] else: return [ str(each_cell[0][0]).strip() if np.size(each_cell[0]) else empty_value for each_cell in m_cell ]
[docs]def _cell_to_float_list( m_cell: np.ndarray, empty_value: Optional[float] = None, inf_value: Optional[float] = None, ) -> List: """Turn an ndarray (cell) to a list of floats. Parameters ---------- m_cell: np.ndarray empty_value: float, optional What value to replace empty cells with. Default None. inf_value: float, optional Replace infinite values with defined inf. Default None (don't replace), will be replaced by given value if given. Returns ------- List A list of processed floats. """ def fix_inf(val: str, _inf_value: float): """Fix inf value, used for rxn.lb and rxn.ub. Parameters ---------- val: str the str to process into float _inf_value: float A value to replace infite values with. """ val = float(val) if np.isinf(val) and val < 0: return -_inf_value elif np.isinf(val) and val > 0: return _inf_value else: return val if inf_value: return [ fix_inf(x[0], _inf_value=inf_value) if np.size(x[0]) else empty_value for x in m_cell ] else: return [float(x[0]) if np.size(x[0]) else empty_value for x in m_cell]
[docs]def load_matlab_model( infile_path: Union[str, Path, IO], variable_name: Optional[str] = None, inf: float = np.inf, ) -> Model: """Load a cobra model stored as a .mat file. Parameters ---------- infile_path : str or Path or filehandle File path or descriptor of the .mat file describing the cobra model. variable_name : str, optional The variable name of the model in the .mat file. If None, then the first MATLAB variable which looks like a COBRA model will be used (default None). inf: float, optional The value to use for infinite bounds. Some solvers do not handle infinite values so for using those, set this to a high numeric value (default `numpy.inf`). Returns ------- cobra.Model The cobra model as represented in the .mat file. Raises ------ ImportError If scipy is not found in the Python environment. IOError If no COBRA model is found in the .mat file. """ if not scipy_io: raise ImportError("load_matlab_model() requires scipy.") if isinstance(infile_path, str): data = scipy_io.loadmat(infile_path) elif isinstance(infile_path, Path): data = scipy_io.loadmat(infile_path.open("rb")) # noqa W9018 else: data = scipy_io.loadmat(infile_path) # noqa W9018 possible_names = [] if variable_name is None: # skip meta variables meta_vars = {"__globals__", "__header__", "__version__"} possible_names = sorted(i for i in data if i not in meta_vars) if len(possible_names) == 1: variable_name = possible_names[0] elif variable_name is not None: return from_mat_struct(data[variable_name], model_id=variable_name, inf=inf) for possible_name in possible_names: try: return from_mat_struct(data[possible_name], model_id=possible_name, inf=inf) except ValueError as e: print(f"Some problem with the model, causing error {e}") # TODO: use custom cobra exception to handle exception # If code here is executed, then no model was found. raise IOError(f"No COBRA model found at {infile_path}.")
[docs]def save_matlab_model( model: Model, file_name: Union[str, Path, IO], varname: Optional[str] = None ) -> None: """Save the cobra model as a .mat file. This .mat file can be used directly in cobratoolbox. Parameters ---------- model : cobra.Model The cobra model to represent. file_name : str or file-like or Path File path or descriptor that the MATLAB representation should be written to. varname : str, optional The name of the variable within the MATLAB workspace. Model ID is used if available, else 'exported_model' is used (default None). """ if not scipy_io: raise ImportError("save_matlab_model() requires scipy.") if varname is None: varname = ( str(model.id) if model.id is not None and len(model.id) > 0 else "exported_model" ) mat = create_mat_dict(model) scipy_io.savemat(file_name, {varname: mat}, appendmat=True, oned_as="column")
[docs]def mat_parse_annotations( target_list: List[Object], mat_struct: np.ndarray, d_replace: str = DICT_MET ) -> None: """Process mat structure annotations in place. Will process mat structured annotations and add them to a list of new entities (metabolites, reactions, genes) in a format based on identifiers.org. Parameters ---------- target_list: list[cobra.Object] A list of cobra objects, including metabolites, reactions or genes. The annotations will be added to these lists. mat_struct: np.ndarray A darray that includes the data imported from matlab file. d_replace: str A string that points to the dictionary of converstions between MATLAB and providers. Default DICT_MET (for metabolite). """ struct_names = [x.casefold() for x in mat_struct.dtype.names] matlab_field_dict = { x.casefold(): DICT_REPLACE[d_replace][x] for x in DICT_REPLACE[d_replace].keys() } caseunfold = {x.casefold(): x for x in mat_struct.dtype.names} annotation_matlab = list(set(struct_names).intersection(matlab_field_dict.keys())) providers = [matlab_field_dict[x] for x in annotation_matlab] annotations = dict.fromkeys(providers, None) for name, mat_key in zip(providers, annotation_matlab): if mat_key == "rxnReferences".casefold(): # This only picks up PMID: style references. Sometimes there are other # things like PMC or OMIM, but those are ignored for now, annotations[name] = _cell_to_str_list( mat_struct[caseunfold[mat_key]][0, 0], None, _pubmed_re ) elif mat_key == "rxnECNumbers".casefold(): # turn EC codes to a list annotations[name] = _cell_to_str_list( mat_struct[caseunfold[mat_key]][0, 0], None, _ec_re ) elif mat_key == "metCHEBIID".casefold(): annotations[name] = _cell_to_str_list( mat_struct[caseunfold[mat_key]][0, 0], None, _chebi_re, "CHEBI:" ) elif mat_key == "metSBOTerms".casefold() or mat_key == "rxnSBOTerms".casefold(): annotations[name] = _cell_to_str_list( mat_struct[caseunfold[mat_key]][0, 0], None, _sbo_re, "SBO:" ) elif mat_key == "metSLMID".casefold(): annotations[name] = _cell_to_str_list( mat_struct[caseunfold[mat_key]][0, 0], None, _sbo_re, "SLM:" ) else: # If it something else, which may have commas, turn it into a list annotations[name] = [ [y.strip() for y in each_cell.split(", ")] if each_cell else None for each_cell in _cell_to_str_list( mat_struct[caseunfold[mat_key]][0, 0] ) ] for i, obj in enumerate(target_list): obj.annotation = { prov: annotations[prov][i] for prov in providers if annotations[prov][i] }
# TODO - When cobrapy.notes are revised not be a dictionary (possibly when # annotations are fully SBML compliant, revise this function.
[docs]def mat_parse_notes( target_list: List[Object], mat_struct: np.ndarray, d_replace: str = DICT_REACTION_NOTES, ) -> None: """Process mat structure notes in place. Will process mat structured notes and add them to a list of new entities (metabolites, reactions, genes) in a format based on identifiers.org. Parameters ---------- target_list: list[cobra.Object] A list of cobra objects, including metabolites, reactions or genes. The notes will be added to these lists. mat_struct: np.ndarray A darray that includes the data imported from matlab file. d_replace: str A string that points to the dictionary of converstions between MATLAB and notes. Default DICT_REACTION_NOTES (for reactions). """ struct_names = [x.casefold() for x in mat_struct.dtype.names] matlab_field_dict = { x.casefold(): DICT_REPLACE[d_replace][x] for x in DICT_REPLACE[d_replace].keys() } caseunfold = {x.casefold(): x for x in mat_struct.dtype.names} annotation_matlab = list(set(struct_names).intersection(matlab_field_dict.keys())) note_providers = [matlab_field_dict[x] for x in annotation_matlab] notes = dict.fromkeys(note_providers, None) for name, mat_key in zip(note_providers, annotation_matlab): if mat_key == "rxnReferences".casefold(): # This only removes PMID: style references. Sometimes there are other # things like PMC or OMIM, but those are placed as string in notes. _notes = _cell_to_str_list(mat_struct[caseunfold[mat_key]][0, 0]) notes[name] = [ _pubmed_re.sub("", x).strip() if x and len(_pubmed_re.sub("", x).strip()) else None for x in _notes ] elif mat_key == "rxnConfidenceScores".casefold(): notes[name] = [ str(confidence) if confidence is not None else "" for confidence in _cell_to_float_list( mat_struct[caseunfold[mat_key]][0, 0] ) ] else: # If it something else, which may have commas, turn it into a list notes[name] = _cell_to_str_list(mat_struct[caseunfold[mat_key]][0, 0]) notes[name] = [ _punctuation_re.sub("", _double_punctuation_re.sub("", x)) if x else None for x in notes[name] ] for i, obj in enumerate(target_list): obj.notes = {prov: notes[prov][i] for prov in note_providers if notes[prov][i]}
[docs]def annotations_to_mat( mat_dict: OrderedDict, annotation_list: List[Dict], d_replace: str = DICT_MET_REV ) -> None: """Process mat structure annotations in place. Will process mat structured annotations and add them to a list of new entities (metabolites, reactions, genes) in a format based on identifiers.org. Parameters ---------- mat_dict: OrderedDict An ordered dictionary having model attributes as keys and their respective values represented as arrays, as the values. Annotations will be inserted into this OrderdDict. annotation_list: list[Dict] A list of cobra annotations, in the form of a dictionary. d_replace: str A string that points to the dictionary of converstions between MATLAB and providers. Default DICT_MET_REV (for metabolite). """ providers_used = set() for i in range(len(annotation_list)): providers_used.update(annotation_list[i].keys()) providers_used = providers_used.intersection(DICT_REPLACE[d_replace].keys()) providers_used = list(providers_used) annotation_matlab = {prov: DICT_REPLACE[d_replace][prov] for prov in providers_used} empty_lists = [[""] * len(annotation_list) for _ in annotation_matlab] annotation_cells_to_be = dict(zip(annotation_matlab.values(), empty_lists)) for i in range(len(annotation_list)): for provider_key, obj_annotation in annotation_list[i].items(): if isinstance(obj_annotation, str): obj_annotation = [obj_annotation] if provider_key == "pubmed": obj_annotation = ", ".join( [ "PMID:" + annot if "PMID:" not in annot else annot for annot in obj_annotation ] ) elif provider_key == "CHEBI": obj_annotation = ", ".join( [ "CHEBI:" + annot if "CHBEI:" not in annot else annot for annot in obj_annotation ] ) elif provider_key == "ec-code": obj_annotation = " or ".join(obj_annotation) else: obj_annotation = ", ".join(obj_annotation) if provider_key not in providers_used: continue annotation_cells_to_be[annotation_matlab[provider_key]][i] = obj_annotation for annotation_key, item_list in annotation_cells_to_be.items(): mat_dict[annotation_key] = _cell(item_list)
# TODO - When cobrapy.notes are revised not be a dictionary (possibly when # annotations are fully SBML compliant, revise this function.
[docs]def notes_to_mat( mat_dict: OrderedDict, note_list: List[Dict], d_replace: str = DICT_MET_REV ) -> None: """Process mat structure notes in place. Will process mat structured annotations and add them to a list of new entities (metabolites, reactions, genes) in a format based on identifiers.org. Parameters ---------- mat_dict: OrderedDict An ordered dictionary having model attributes as keys and their respective values represented as arrays, as the values. Annotations will be inserted into this OrderdDict. note_list: list[Dict] A list of cobra annotations, in the form of a dictionary. d_replace: str A string that points to the dictionary of converstions between MATLAB and providers. Default DICT_MET_REV (for metabolite). """ providers_used = set() for i in range(len(note_list)): providers_used.update(note_list[i].keys()) providers_used = providers_used.intersection(DICT_REPLACE[d_replace].keys()) providers_used = list(providers_used) annotation_matlab = {prov: DICT_REPLACE[d_replace][prov] for prov in providers_used} empty_lists = [[""] * len(note_list) for _ in annotation_matlab] annotation_cells_to_be = dict(zip(annotation_matlab.values(), empty_lists)) for i in range(len(note_list)): for provider_key, object_note in note_list[i].items(): if provider_key not in providers_used: continue if provider_key == CONFIDENCE_STR: object_note = float(object_note) if not len(annotation_cells_to_be[annotation_matlab[provider_key]][i]): annotation_cells_to_be[annotation_matlab[provider_key]][i] = object_note else: # References that aren't MIRIAM compliant will go to rxnNotes annotation_cells_to_be[annotation_matlab[provider_key]][i] = ( annotation_cells_to_be[annotation_matlab[provider_key]][i] + f"; {object_note}" ) for annotation_key, item_list in annotation_cells_to_be.items(): mat_dict[annotation_key] = _cell(item_list)
[docs]def create_mat_dict(model: Model) -> OrderedDict: """Create a dictionary mapping model attributes to arrays. Parameters ---------- model : cobra.Model The model to create dictionary for. Returns ------- OrderedDict The ordered dictionary having model attributes as keys and their respective values represented as arrays, as the values. """ rxns = model.reactions mets = model.metabolites mat = OrderedDict() mat["mets"] = _cell(mets.list_attr("id")) model_has_compartment_names = False if list(model.compartments.keys()) != list(model.compartments.values()): model_has_compartment_names = True if {_get_id_compartment(met_id) for met_id in mets.list_attr("id")} == { None } or model_has_compartment_names: comps = list(model.compartments.keys()) mat["comps"] = _cell(comps) mat["compNames"] = _cell([model.compartments[comp] for comp in comps]) mat["metComps"] = [comps.index(x) + 1 for x in mets.list_attr("compartment")] mat["metNames"] = _cell(mets.list_attr("name")) mat["metFormulas"] = _cell([str(m.formula) if m.formula else "" for m in mets]) try: mat["metCharges"] = np.array(mets.list_attr("charge")) * 1.0 except (TypeError, AttributeError): # can't have any None entries for charge, or this will fail # TODO: use custom cobra exception to handle exception pass annotations_to_mat(mat, mets.list_attr("annotation"), DICT_MET_REV) # TODO - When cobrapy.notes are revised not be a dictionary (possibly when # annotations are fully SBML compliant, revise this function. notes_to_mat(mat, mets.list_attr("notes"), DICT_MET_NOTES_REV) mat["genes"] = _cell(model.genes.list_attr("id")) gene_names = model.genes.list_attr("name") if not all(_name == "" for _name in gene_names): mat["geneNames"] = _cell(gene_names) annotations_to_mat(mat, model.genes.list_attr("annotation"), DICT_GENE_REV) # make a matrix for rxnGeneMat # reactions are rows, genes are columns rxn_gene = scipy_sparse.dok_matrix((len(model.reactions), len(model.genes))) if min(rxn_gene.shape) > 0: for i, reaction in enumerate(model.reactions): for gene in reaction.genes: rxn_gene[i, model.genes.index(gene)] = 1 mat["rxnGeneMat"] = rxn_gene mat["grRules"] = _cell(rxns.list_attr("gene_reaction_rule")) mat["rxns"] = _cell(rxns.list_attr("id")) mat["rxnNames"] = _cell(rxns.list_attr("name")) annotations_to_mat(mat, rxns.list_attr("annotation"), DICT_REACTION_REV) notes_to_mat(mat, rxns.list_attr("notes"), DICT_REACTION_NOTES_REV) rxn_subsystems = _cell(rxns.list_attr("subsystem")) group_ids = model.groups.list_attr("id") group_names = model.groups.list_attr("name") group_ids = [ _id if not _name else _name for _id, _name in zip(group_ids, group_names) ] group_members_list = model.groups.list_attr("members") if set(rxn_subsystems) == {""} and set(group_ids) != {""}: subsystems = [[] for _ in rxns] for group_id, group_members in zip(group_ids, group_members_list): group = model.groups.get_by_id(group_id) if group.kind == "partonomy": for member in group_members: if isinstance(member, Reaction): rxn_ind = model.reactions.index(member) # noinspection PyTypeChecker subsystems[rxn_ind].append(group_id) mat["subSystems"] = _cell( [", ".join(subsystem_list) for subsystem_list in subsystems] ) else: mat["subSystems"] = _cell(rxns.list_attr("subsystem")) stoich_mat = create_stoichiometric_matrix(model, array_type="dok") mat["S"] = stoich_mat if stoich_mat is not None else [[]] # multiply by 1 to convert to float, working around scipy bug # https://github.com/scipy/scipy/issues/4537 mat["lb"] = np.array(rxns.list_attr("lower_bound")) * 1.0 mat["ub"] = np.array(rxns.list_attr("upper_bound")) * 1.0 mat["b"] = np.array(mets.list_attr("_bound")) * 1.0 mat["c"] = np.array(rxns.list_attr("objective_coefficient")) * 1.0 mat["rev"] = np.array(rxns.list_attr("reversibility")) * 1 if model.name: mat["modelName"] = str(model.name) mat["description"] = str(model.id) return mat
[docs]def from_mat_struct( mat_struct: np.ndarray, model_id: Optional[str] = None, inf: float = np.inf, ) -> Model: """Create a model from the cobratoolbox struct. Parameters ---------- mat_struct : numpy.ndarray The `numpy.ndarray` that most likely contains the model, being chosen by load_matlab_file after loading the matlab structure via scipy_io.loadmat. model_id : str, optional The ID of the model generated. If None, will try to look for ID in model's description. If multiple IDs are found, the first one is used. If no IDs are found, will use 'imported_model' (default None). inf : float, optional The value to use for infinite bounds. Some solvers do not handle infinite values so for using those, set this to a high numeric value (default `numpy.inf`). Returns ------- cobra.Model The model as represented in .mat file. """ m = mat_struct if m.dtype.names is None or {"rxns", "mets", "S", "lb", "ub"} > set(m.dtype.names): raise ValueError("Invalid MATLAB struct.") old_cobratoolbox_fields = [ "confidenceScores", "metCharge", "ecNumbers", "KEGGID", "metSmile", "metHMDB", ] new_cobratoolbox_fields = [ "rxnConfidenceScores", "metCharges", "rxnECNumbers", "metKEGGID", "metSmiles", "metHMDBID", ] for old_field, new_field in zip(old_cobratoolbox_fields, new_cobratoolbox_fields): if old_field in m.dtype.names and new_field not in m.dtype.names: logger.warning( f"This model seems to have {old_field} instead of {new_field} field. " f"Will use {old_field} for what {new_field} represents." ) new_names = list(m.dtype.names) new_names[new_names.index(old_field)] = new_field m.dtype.names = new_names model = Model() if model_id is not None: model.id = model_id elif "description" in m.dtype.names: description = m["description"][0, 0][0] if not isinstance(description, str) and len(description) > 1: model.id = description[0] logger.warning("Several IDs detected, only using the first.") else: model.id = description else: model.id = "imported_model" if "modelName" in m.dtype.names and np.size(m["modelName"][0, 0]): model.name = m["modelName"][0, 0][0] met_ids = _cell_to_str_list(m["mets"][0, 0]) if {"metComps", "comps", "compNames"}.issubset(m.dtype.names): met_comp_index = [x[0] - 1 for x in m["metComps"][0][0]] comps = _cell_to_str_list(m["comps"][0, 0]) comp_names = _cell_to_str_list(m["compNames"][0][0]) met_comps = [comps[i] for i in met_comp_index] met_comp_names = [comp_names[i] for i in met_comp_index] else: logger.warning( f"No defined compartments in model {model.id}. " f"Compartments will be deduced heuristically " f"using regular expressions." ) met_comps = [_get_id_compartment(x) for x in met_ids] met_comp_names = met_comps if None in met_comps or "" in met_comps: raise ValueError("Some compartments were empty. Check the model!") logger.warning( f"Using regular expression found the following compartments:" f"{', '.join(sorted(set(met_comps)))}" ) if None in met_comps or "" in met_comps: raise ValueError("Some compartments were empty. Check the model!") model.compartments = dict(zip(met_comps, met_comp_names)) met_names, met_formulas, met_charges = None, None, None try: met_names = _cell_to_str_list(m["metNames"][0, 0], "") except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass try: met_formulas = _cell_to_str_list(m["metFormulas"][0, 0]) except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass try: met_charges = _cell_to_float_list(m["metCharges"][0, 0]) except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass new_metabolites = [] for i in range(len(met_ids)): new_metabolite = Metabolite(met_ids[i], compartment=met_comps[i]) if met_names: new_metabolite.name = met_names[i] if met_charges: new_metabolite.charge = met_charges[i] if met_formulas: new_metabolite.formula = met_formulas[i] new_metabolites.append(new_metabolite) mat_parse_annotations(new_metabolites, m, d_replace=DICT_MET) mat_parse_notes(new_metabolites, m, d_replace=DICT_MET_NOTES) model.add_metabolites(new_metabolites) if "genes" in m.dtype.names: gene_names = None gene_ids = _cell_to_str_list(m["genes"][0, 0]) try: gene_names = _cell_to_str_list(m["geneNames"][0, 0]) except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass new_genes = [ Gene(gene_ids[i], name=gene_names[i]) if gene_names else Gene(gene_ids[i]) for i in range(len(gene_ids)) ] mat_parse_annotations(new_genes, m, d_replace=DICT_GENE) for current_gene in new_genes: current_gene._model = model model.genes += new_genes new_reactions = [] rxn_ids = _cell_to_str_list(m["rxns"][0, 0]) rxn_lbs = _cell_to_float_list(m["lb"][0, 0], empty_value=None, inf_value=inf) rxn_ubs = _cell_to_float_list(m["ub"][0, 0], empty_value=None, inf_value=inf) rxn_gene_rules, rxn_names, rxn_subsystems = None, None, None try: rxn_gene_rules = _cell_to_str_list(m["grRules"][0, 0], "") except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass try: rxn_names = _cell_to_str_list(m["rxnNames"][0, 0], "") except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass try: # RECON3.0 mat has an array within an array for subsystems. # If we find a model that has multiple subsytems per reaction, this should be # modified if np.sctype2char(m["subSystems"][0, 0][0][0]) == "O" and isinstance( m["subSystems"][0, 0][0][0][0], np.ndarray ): rxn_subsystems = [ each_cell[0][0][0][0] if each_cell else "" for each_cell in m["subSystems"][0, 0] ] # Other matlab files seem normal. else: rxn_subsystems = _cell_to_str_list(m["subSystems"][0, 0], "") except (IndexError, ValueError): # TODO: use custom cobra exception to handle exception pass for i in range(len(rxn_ids)): new_reaction = Reaction( id=rxn_ids[i], lower_bound=rxn_lbs[i], upper_bound=rxn_ubs[i] ) if rxn_names: new_reaction.name = rxn_names[i] if rxn_subsystems: new_reaction.subsystem = rxn_subsystems[i] if rxn_gene_rules: new_reaction.gene_reaction_rule = rxn_gene_rules[i] new_reactions.append(new_reaction) mat_parse_annotations(new_reactions, m, d_replace=DICT_REACTION) # TODO - When cobrapy.notes are revised not be a dictionary (possibly when # annotations are fully SBML compliant, revise this function. mat_parse_notes(new_reactions, m, d_replace=DICT_REACTION_NOTES) csc = scipy_sparse.csc_matrix(m["S"][0, 0]) for i in range(csc.shape[1]): stoic_dict = { model.metabolites[j]: csc[j, i] for j in csc.getcol(i).nonzero()[0] } new_reactions[i].add_metabolites(stoic_dict) model.add_reactions(new_reactions) # Make subsystems into groups if rxn_subsystems: rxn_group_names = set(rxn_subsystems).difference({None}) new_groups = [] for g_name in sorted(rxn_group_names): group_members = model.reactions.query( lambda x: x.subsystem == g_name # noqa: B023 ) new_group = Group( id=g_name, name=g_name, members=group_members, kind="partonomy" ) new_group.annotation["sbo"] = "SBO:0000633" new_groups.append(new_group) model.add_groups(new_groups) if "c" in m.dtype.names: c_vec = _cell_to_float_list(m["c"][0, 0]) coefficients = dict(zip(new_reactions, c_vec)) set_objective(model, coefficients) else: logger.warning("Objective vector `c` not found.") if "osenseStr" in m.dtype.names: if isinstance(m["osenseStr"][0, 0][0], np.str_): model.objective_direction = str(m["osenseStr"][0, 0][0]) elif isinstance(m["osenseStr"][0, 0][0], np.ndarray): model.objective_direction = str(m["osenseStr"][0, 0][0][0]) elif "osense" in m.dtype.names: osense = float(m["osense"][0, 0][0][0]) objective_direction_str = "max" if osense == 1: objective_direction_str = "min" model.objective_direction = objective_direction_str return model