Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Plot stacked bar of file types in artifact #20

Merged
merged 4 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions app_config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,12 @@
"description": "Whether to enable the file path search feature.\nThe file path search feature sends information about your query to an API provided by Quansight.",
"title": "Enable Filepath Search",
"type": "boolean"
},
"enable_filetype_plot": {
"default": true,
"description": "Whether to enable Altair charts to plot a basic path-based analysis of the files included in\neach artifact.",
"title": "Enable Filetype Plot",
"type": "boolean"
}
},
"required": [
Expand Down
1 change: 1 addition & 0 deletions app_config.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#:schema ./app_config.schema.json
enable_filepath_search = true
enable_filetype_plot = true

[channels.conda-forge]
url = "https://conda.anaconda.org/conda-forge"
Expand Down
6 changes: 6 additions & 0 deletions conda_metadata_app/app_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,12 @@ def validate_channels(cls, channels: dict[str, Channel]) -> dict[str, Channel]:
The file path search feature sends information about your query to an API provided by Quansight.
"""

enable_filetype_plot: bool = True
"""
Whether to enable Altair charts to plot a basic path-based analysis of the files included in
each artifact.
"""

@model_validator(mode="after")
def _validate_dashboards(self) -> Self:
for channel in self.channels.values():
Expand Down
95 changes: 94 additions & 1 deletion conda_metadata_app/pages/main_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
"""

import json
import mimetypes
import os
import re
import typing
from collections import defaultdict
from contextlib import closing
from datetime import datetime
from difflib import unified_diff
Expand Down Expand Up @@ -66,6 +68,56 @@
initial_sidebar_state="expanded",
)

# Maps a lower-cased file extension (including the leading dot) to the
# file-type category reported for it. `_categorize_path` consults this table
# before guessing a MIME type, so entries here take precedence over whatever
# `mimetypes` would guess for the same extension.
EXTENSION_TO_CATEGORY = {
".a": "Library",
".bat": "Shell",
".cmd": "Shell",
".csh": "Shell",
".dll": "Library",
".dylib": "Library",
".exe": "Executable",
".fish": "Shell",
".go": "Go",
".h": "Headers",
".hpp": "Headers",
".lib": "Library",
".pm": "Perl",
".ps1": "Shell",
".psm1": "Shell",
".pyi": "Python",
".pyd": "Library",
".rs": "Rust",
".sh": "Shell",
".so": "Library",
".xsh": "Shell",
".zsh": "Shell",
}
# Maps a MIME type, as guessed by `mimetypes.guess_type`, to the file-type
# category reported for it. The `None` key supplies the category for paths
# whose MIME type cannot be guessed.
MIMETYPE_TO_CATEGORY = {
None: "Other",
"application/java-vm": "Java",
"application/javascript": "JavaScript",
"application/json": "JSON",
"application/octet-stream": "Binary",
"application/pdf": "PDF",
"application/vnd.ms-fontobject": "Fonts",
"application/x-font-type1": "Fonts",
"application/x-python-code": "Python",
"application/x-tar": "Archives",
"application/x-tcl": "TCL",
"application/x-tgif": "Multimedia",
"application/xml": "XML",
"application/zip": "Archives",
"text/css": "CSS",
"text/csv": "CSV",
"text/html": "HTML",
"text/markdown": "Markdown",
"text/plain": "Text",
"text/x-c": "C",
"text/x-fortran": "Fortran",
"text/x-perl": "Perl",
"text/x-python": "Python",
}


def bar_esc(s: str) -> str:
"Escape vertical bars in tables"
Expand Down Expand Up @@ -591,6 +643,40 @@ def _is_broken(
raise RuntimeError("Invalid artifact discovery choice. This is an implementation error.")


def _categorize_path(path: str) -> str:
    """
    Classify a file path from an artifact into a coarse file-type category.

    The checks run from most to least specific:

    1. Extension-less files located under a ``bin`` directory are "Executable".
    2. Extensions listed in ``EXTENSION_TO_CATEGORY``.
    3. MIME types (guessed from the path) listed in ``MIMETYPE_TO_CATEGORY``,
       then the generic ``font/*`` and ``image|audio|video/*`` families.
    4. Path heuristics: versioned shared libraries (``libfoo.so.1``),
       extension-less ``LICENSE`` files and numbered man pages.
    5. Fallback: the raw MIME type if one was guessed, otherwise the category
       registered for ``None`` in ``MIMETYPE_TO_CATEGORY``.

    Parameters
    ----------
    path
        Forward-slash separated path of a file, relative to the artifact root.

    Returns
    -------
    str
        The category label, or the raw MIME type string when nothing more
        specific matched.
    """
    ext = os.path.splitext(path)[1].lower()
    components = path.split("/")
    filename = components[-1]

    if not ext and "bin" in components[:-1]:
        return "Executable"
    if category := EXTENSION_TO_CATEGORY.get(ext):
        return category

    mimetype, _ = mimetypes.guess_type(path)
    # Only consult the MIME table for a real guess. The table also holds a
    # `None -> "Other"` entry, and matching it here would return before the
    # path heuristics below get a chance to run for unknown file types.
    if mimetype is not None:
        if category := MIMETYPE_TO_CATEGORY.get(mimetype):
            return category
        # partition() tolerates MIME strings without exactly one slash.
        family = mimetype.partition("/")[0]
        if family == "font":
            return "Fonts"
        if family in ("image", "audio", "video"):
            return "Multimedia"

    if ".so." in filename:
        return "Library"
    if not ext and "LICENSE" in filename:
        return "Text"
    if components[0] == "man" and ext[1:].isdigit():
        return "Text"

    if mimetype is None:
        return MIMETYPE_TO_CATEGORY.get(None, "Other")
    return mimetype


def _content_analysis_plot(paths: list[str]):
    """
    Draw a normalized, horizontal stacked bar of file categories for `paths`.

    Each path is classified with `_categorize_path` and the per-category
    counts are passed to `st.bar_chart`, most frequent category first.
    Returns None without drawing when `enable_filetype_plot` is off in the
    app configuration; otherwise returns whatever `st.bar_chart` returns.
    """
    if app_config().enable_filetype_plot:
        tally = defaultdict(int)
        for entry in paths:
            category = _categorize_path(entry)
            tally[category] += 1
        # Descending by count; sorted() is stable, so ties keep first-seen order.
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        return st.bar_chart([dict(ranked)], horizontal=True, stack="normalize")
    return None


def patched_repodata(channel: str, subdir: str, artifact: str) -> tuple[dict, bool]:
"""
This function assumes that the artifact discovery mode for the channel is "anaconda".
Expand Down Expand Up @@ -1095,7 +1181,14 @@ def disable_button(query):

if data.get("files"):
st.write("### Files")
all_files = "\n".join(data["files"])
_content_analysis_plot(data["files"])
if (n_files := len(data["files"])) > 10000:
st.info(
f"Too many files ({n_files}). Showing only first 10K. "
"Check raw JSON below for full list.",
icon="ℹ️",
)
all_files = "\n".join(data["files"][:10000])
st.code(all_files, language="text", line_numbers=True)

st.write("### Raw JSON")
Expand Down
Loading