Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compress nodes and edges before upload; remove interstitial files #88

Merged
Merged 2 commits on Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/kg_bioportal/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,14 @@ def download(
@main.command()
@click.option("--input_dir", "-i", default="data/raw", type=click.Path(exists=True))
@click.option("--output_dir", "-o", default="data/transformed")
def transform(input_dir, output_dir) -> None:
@click.option(
"--compress",
"-c",
is_flag=True,
default=True,
help="If true, compresses the output nodes and edges to tar.gz. Defaults to True.",
)
def transform(input_dir, output_dir, compress) -> None:
"""Transforms all ontologies in the input directory to KGX nodes and edges.

Yields two log files: total_stats.yaml and onto_stats.yaml.
Expand All @@ -190,7 +197,7 @@ def transform(input_dir, output_dir) -> None:

tx = Transformer(input_dir=input_dir, output_dir=output_dir)

tx.transform_all()
tx.transform_all(compress=compress)

return None

Expand Down
33 changes: 28 additions & 5 deletions src/kg_bioportal/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import sys
import tarfile
from typing import Tuple

import yaml
Expand Down Expand Up @@ -55,15 +56,15 @@ def __init__(

return None

def transform_all(self) -> None:
def transform_all(self, compress: bool) -> None:
"""Transforms all ontologies in the input directory to KGX nodes and edges.

Yields two log files: total_stats.yaml and onto_stats.yaml.
The first contains the total counts of Bioportal ontologies and transforms.
The second contains the counts of nodes and edges for each ontology.

Args:
None.
compress: If True, compresses the output nodes and edges to tar.gz.

Returns:
None.
Expand Down Expand Up @@ -95,7 +96,7 @@ def transform_all(self) -> None:

for filepath in filepaths:
ontology_name = (os.path.relpath(filepath, self.input_dir)).split(os.sep)[0]
success, nodecount, edgecount = self.transform(filepath)
success, nodecount, edgecount = self.transform(filepath, compress)
if not success:
logging.error(f"Error transforming {filepath}.")
status = False
Expand Down Expand Up @@ -142,11 +143,12 @@ def transform_all(self) -> None:

return None

def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
def transform(self, ontology_path: str, compress: bool) -> Tuple[bool, int, int]:
"""Transforms a single ontology to KGX nodes and edges.

Args:
ontology: A string of the path to the ontology file to transform.
ontology_path: A string of the path to the ontology file to transform.
compress: If True, compresses the output nodes and edges to tar.gz.

Returns:
Tuple of:
Expand Down Expand Up @@ -239,6 +241,27 @@ def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
with open(edgefilename, "r") as f:
edgecount = len(f.readlines()) - 1

# Compress if requested
if compress:
logging.info("Compressing nodes and edges.")
with tarfile.open(f"{outfilename}.tar.gz", "w:gz") as tar:
tar.add(nodefilename, arcname=f"{ontology_name}_nodes.tsv")
tar.add(edgefilename, arcname=f"{ontology_name}_edges.tsv")

os.remove(nodefilename)
os.remove(edgefilename)

# Remove the owl files
# They may not exist if the transform failed
try:
os.remove(owl_output_path)
except OSError:
pass
try:
os.remove(relaxed_outpath)
except OSError:
pass

except Exception as e:
logging.error(
f"Error transforming {ontology_name} to KGX nodes and edges: {e}"
Expand Down
Loading