Quick Start Guide
This guide will get you up and running with rxn_rdf_converter in minutes.
Installation
pip install rxn_rdf_converter
Basic Usage
Process Multiple Datasets CLI
python rxn_rdf_converter all --dataset_root ../datasets/ --save_path ../save_path --onto_file_path ../onto_file_path --error_log_directory ../error_log_directory
Process a Single Dataset CLI
python rxn_rdf_converter single-dataset --dataset_file_path ../dataset_file_path --save_path ../save_path --onto_file_path ../onto_file_path --error_log_directory ../error_log_directory
Process Multiple Datasets
import ord_schema
from ord_schema.message_helpers import load_message, write_message, message_to_row
from ord_schema.proto import dataset_pb2, reaction_pb2
import os
from rdkit import Chem
import re
from owlready2 import get_namespace, get_ontology, Thing
import rdflib
from rdflib import Graph, RDF, RDFS, OWL, Namespace, Literal, URIRef
from rdflib.namespace import RDFS, XSD, URIRef, OWL, SKOS, PROV
import logging
import csv
import rxn_rdf_converter
from rxn_rdf_converter import DatasetProcessor
# Make sure the log directory exists before a file handler is attached to it.
error_log_directory = '../error_logs'
os.makedirs(error_log_directory, exist_ok=True)

# Route INFO-and-above records both to <error_log_directory>/main.log and to
# a stream handler (console output).
main_log_file = os.path.join(error_log_directory, 'main.log')
log_handlers = [logging.FileHandler(main_log_file), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=log_handlers,
)
logger = logging.getLogger(__name__)
def main(
    dataset_root='../datasets/',
    save_path='../save_path',
    onto_file_path='../MDS-Onto.owl',
    csv_output_path='/mnt/vstor/CSE_MSE_RXF131/staging/mds3/KG-ChemRxn/output_logs',
):
    """Convert every ORD dataset under ``dataset_root`` and log its reactions.

    Each file whose name starts with ``'ord_dataset'`` is handed to a
    ``DatasetProcessor``; the accumulated reaction list is then written to
    ``dataset_reactions.csv`` inside ``csv_output_path``. A failure in one
    dataset is logged and does not stop the remaining datasets.

    Args:
        dataset_root: Directory searched recursively for dataset files.
        save_path: Output directory passed to ``DatasetProcessor``.
        onto_file_path: Path of the OWL ontology file passed to
            ``DatasetProcessor``.
        csv_output_path: Directory for ``dataset_reactions.csv``. The default
            is the site-specific path used by the original guide; pass your
            own directory when running elsewhere.
    """
    try:
        logger.info("Starting Data Processing....")

        # Discover the dataset files. Sorting keeps the processing order
        # stable between runs (os.walk yields entries in arbitrary order).
        file_list = sorted(
            os.path.join(root, name)
            for root, _dirs, files in os.walk(dataset_root)
            for name in files
            if name.startswith('ord_dataset')
        )
        logger.info(f"Found {len(file_list)} data files")

        dataset_reaction_list = []
        for dataset in file_list:
            # Reset per iteration so the cleanup below never touches a
            # processor left over from a previous dataset.
            dataset_processor = None
            try:
                logger.info(f"Processing dataset {dataset}")
                dataset_processor = rxn_rdf_converter.DatasetProcessor(
                    dataset_pb=dataset_pb2,
                    dataset_file_path=dataset,
                    owl_onto_file_path=onto_file_path,
                    output_directory=save_path,
                    error_log_directory=error_log_directory,
                    fmt='json-ld',
                )
                # The third return value replaces the running list; it is
                # presumably the list extended with this dataset's
                # (dataset_id, reaction_id) rows -- confirm against the API.
                _, _, dataset_reaction_list = dataset_processor.extract_reaction(
                    dataset_reaction_list
                )
                logger.info(f"Successfully completed dataset {dataset}")
            except Exception as e:
                # Best effort: record the failure and move on to the next dataset.
                logger.error(f"Failed to process dataset {dataset} - Error: {e}")
            finally:
                # Clean up logger resources, but only if construction succeeded.
                if dataset_processor is not None:
                    dataset_processor.cleanup_logger()

        # Save the results.
        os.makedirs(csv_output_path, exist_ok=True)
        with open(os.path.join(csv_output_path, 'dataset_reactions.csv'), 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['dataset_id', 'reaction_id'])
            writer.writerows(dataset_reaction_list)
        logger.info("Data processing completed successfully")
    except Exception as e:
        logger.error(f"Error in main execution: {e}", exc_info=True)


if __name__ == '__main__':
    main()
Process Individual Dataset
import ord_schema
from ord_schema.message_helpers import load_message, write_message, message_to_row
from ord_schema.proto import dataset_pb2, reaction_pb2
import os
from rdkit import Chem
import re
from owlready2 import get_namespace, get_ontology, Thing
import rdflib
from rdflib import Graph, RDF, RDFS, OWL, Namespace, Literal, URIRef
from rdflib.namespace import RDFS, XSD, URIRef, OWL, SKOS, PROV
import logging
import csv
import rxn_rdf_converter
from rxn_rdf_converter import DatasetProcessor
# =================================================================
# SETUP LOGGING ERRORS
# =================================================================
# Route INFO-and-above records both to 'error.log' (in the current working
# directory) and to a stream handler (console output).
file_handler = logging.FileHandler('error.log')
console_handler = logging.StreamHandler()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[file_handler, console_handler],
)
logger = logging.getLogger(__name__)
# =================================================================
# INITIATE FILE PATH
# =================================================================
def setup_file_path(dataset_path):
    """Return the paths of all ORD dataset files found under ``dataset_path``.

    The directory tree is walked recursively and every file whose name starts
    with ``'ord_dataset'`` is collected.

    Args:
        dataset_path: Root directory to search.

    Returns:
        A sorted list of file paths. Sorting makes an index into the list
        refer to the same dataset on every run, since ``os.walk`` yields
        entries in arbitrary order. The list is empty when nothing matches
        or the directory does not exist.
    """
    return sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dataset_path)
        for name in files
        if name.startswith('ord_dataset')
    )
# =================================================================
# MAIN FUNCTION
# =================================================================
dataset_path = '../datasets/'
save_path = '../save_path'
onto_file_path = '../MDS-Onto.owl'
error_log_directory = '../error_log_directory'
# Index, within the discovered file list, of the dataset to convert.
dataset_index = 0

try:
    logger.info("Starting Data Processing....")

    # Set up paths: discover the dataset files and make sure the error-log
    # directory exists before the processor is given it.
    os.makedirs(error_log_directory, exist_ok=True)
    file_list = setup_file_path(dataset_path)
    logger.info(f"Found {len(file_list)} data files")
    dataset_file_path = file_list[dataset_index]

    dataset_reaction_list = []
    dataset_1 = rxn_rdf_converter.DatasetProcessor(
        dataset_pb=dataset_pb2,
        dataset_file_path=dataset_file_path,
        owl_onto_file_path=onto_file_path,
        output_directory=save_path,
        error_log_directory=error_log_directory,
        fmt='json-ld',
    )
    # Only the third return value (the updated reaction list) is used here.
    _, _, dataset_reaction_list = dataset_1.extract_reaction(dataset_reaction_list)

    # Save the results.
    with open('dataset_reactions.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['dataset_id', 'reaction_id'])
        writer.writerows(dataset_reaction_list)
    print(f"Collected {len(dataset_reaction_list)} reaction mappings")
    print("Saved to dataset_reactions.csv")
except Exception as e:
    logger.error(f"Error in main execution: {e}", exc_info=True)
Process Individual Reaction
import ord_schema
from ord_schema.message_helpers import load_message, write_message, message_to_row
from ord_schema.proto import dataset_pb2, reaction_pb2
import os
from rdkit import Chem
import re
from owlready2 import get_namespace, get_ontology, Thing
import rdflib
from rdflib import Graph, RDF, RDFS, OWL, Namespace, Literal, URIRef
from rdflib.namespace import RDFS, XSD, URIRef, OWL, SKOS, PROV
import logging
import csv
import rxn_rdf_converter
from rxn_rdf_converter import ReactionKG
def setup_file_path(dataset_path):
    """Collect every ORD dataset file located beneath ``dataset_path``.

    All sub-directories are searched; a file qualifies when its name starts
    with ``'ord_dataset'``.

    Args:
        dataset_path: Root directory to search.

    Returns:
        The matching file paths as a sorted list. ``os.walk`` reports entries
        in arbitrary order, so sorting keeps list indices stable across runs.
        An empty list is returned when nothing matches or the directory does
        not exist.
    """
    matches = []
    for root, _dirs, files in os.walk(dataset_path):
        matches.extend(
            os.path.join(root, name)
            for name in files
            if name.startswith('ord_dataset')
        )
    matches.sort()
    return matches
dataset_path = '../datasets/'
save_path = '../save_path'
onto_file_path = '../MDS-Onto.owl'

# Collect the file paths of all datasets found under dataset_path.
file_list = setup_file_path(dataset_path)

# Load one ORD dataset into memory; dataset_index selects its path in
# file_list. All reactions of that dataset become available as dataset.reactions.
dataset_index = 0
dataset = load_message(file_list[dataset_index], dataset_pb2.Dataset)

# Convert a single reaction; reaction_index selects it within dataset.reactions.
reaction_index = 0
reaction_1 = (
    rxn_rdf_converter.ReactionKG(dataset.reactions[reaction_index], fmt="json-ld")
    .generate_reaction()
    .generate_instances(onto_file_path)
    .generate_data_graph(dataset.dataset_id, save_path)
)
Next Steps
Read the full :doc:`rxn_rdf_converter` documentation
Browse the :doc:`modules` for detailed API reference
Check out more examples in the main documentation