From 4c16dff54a6da929008888d1f86dbcb0c62d7312 Mon Sep 17 00:00:00 2001
From: ggmarshall <george.marshall.20@ucl.ac.uk>
Date: Wed, 12 Mar 2025 21:22:18 +0100
Subject: [PATCH 1/3] specify instreams and crate for aux

---
 .pre-commit-config.yaml                          |  2 +-
 .../src/legenddataflow/scripts/tier/raw_orca.py  | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8f713bef..074c838b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -78,7 +78,7 @@ repos:
     rev: "v2.3.0"
     hooks:
       - id: codespell
-        args: ["-L", "nd,unparseable,compiletime,livetime,fom,puls"]
+        args: ["-L", "nd,unparseable,compiletime,livetime,fom,puls,crate"]
 
   - repo: https://github.com/shellcheck-py/shellcheck-py
     rev: "v0.10.0.1"
diff --git a/workflow/src/legenddataflow/scripts/tier/raw_orca.py b/workflow/src/legenddataflow/scripts/tier/raw_orca.py
index 9ee5a0f1..c4f954b4 100644
--- a/workflow/src/legenddataflow/scripts/tier/raw_orca.py
+++ b/workflow/src/legenddataflow/scripts/tier/raw_orca.py
@@ -27,7 +27,7 @@ def build_tier_raw_orca() -> None:
 
     configs = TextDB(args.configs, lazy=True)
     config_dict = configs.on(args.timestamp, system=args.datatype)["snakemake_rules"][
-        "tier_raw"
+        "tier_raw_orca"
     ]
 
     build_log(config_dict, args.log)
@@ -63,11 +63,25 @@ def build_tier_raw_orca() -> None:
         spm_config[next(iter(spm_config))]["spms"]["key_list"] = sorted(spm_channels)
         Props.add_to(all_config, spm_config)
 
+    if "muon_config" in list(channel_dict):
+        muon_config = Props.read_from(channel_dict["muon_config"])
+        muon_channels = list(
+            chmap.channelmaps.on(args.timestamp)
+            .map("system", unique=False)["muon"]
+            .map("daq.rawid")
+        )
+        top_key = next(iter(muon_config))
+        muon_config[top_key][next(iter(muon_config[top_key]))]["key_list"] = sorted(
+            muon_channels
+        )
+        Props.add_to(all_config, muon_config)
+
     if "auxs_config" in list(channel_dict):
         aux_config = Props.read_from(channel_dict["auxs_config"])
         aux_channels = list(
             chmap.channelmaps.on(args.timestamp)
             .map("system", unique=False)["auxs"]
+            .map("daq.crate", unique=False)[1]
             .map("daq.rawid")
         )
         aux_channels += list(

From 43242ae2fa0cf93b5db7562ad2191096c0769020 Mon Sep 17 00:00:00 2001
From: ggmarshall <george.marshall.20@ucl.ac.uk>
Date: Wed, 12 Mar 2025 21:42:54 +0100
Subject: [PATCH 2/3] add rules for decoding gzip and bzip orca files

---
 workflow/rules/raw.smk                        | 56 +++++++++++++++++++
 workflow/src/legenddataflow/FileKey.py        |  7 ++-
 .../src/legenddataflow/create_pars_keylist.py |  9 ++-
 .../legenddataflow/scripts/tier/raw_fcio.py   |  8 ++-
 .../legenddataflow/scripts/tier/raw_orca.py   | 21 +++----
 5 files changed, 83 insertions(+), 18 deletions(-)

diff --git a/workflow/rules/raw.smk b/workflow/rules/raw.smk
index 25f9b37e..86001173 100644
--- a/workflow/rules/raw.smk
+++ b/workflow/rules/raw.smk
@@ -48,6 +48,62 @@ rule build_raw_orca:
         "{params.ro_input} {output}"
 
 
+rule build_raw_orca_bz2:
+    """
+    This rule runs build_raw, it takes in a file.{daq_ext} and outputs a raw file
+    """
+    input:
+        get_pattern_tier_daq(config, extension="orca.bz2"),
+    params:
+        timestamp="{timestamp}",
+        datatype="{datatype}",
+        ro_input=lambda _, input: ro(input),
+    output:
+        get_pattern_tier(config, "raw", check_in_cycle=check_in_cycle),
+    log:
+        get_pattern_log(config, "tier_raw", time),
+    group:
+        "tier-raw"
+    resources:
+        mem_swap=110,
+        runtime=300,
+    shell:
+        execenv_pyexe(config, "build-tier-raw-orca") + "--log {log} "
+        f"--configs {ro(configs)} "
+        f"--chan-maps {ro(chan_maps)} "
+        "--datatype {params.datatype} "
+        "--timestamp {params.timestamp} "
+        "{params.ro_input} {output}"
+
+
+rule build_raw_orca_gzip:
+    """
+    This rule runs build_raw, it takes in a file.{daq_ext} and outputs a raw file
+    """
+    input:
+        get_pattern_tier_daq(config, extension="orca.gz"),
+    params:
+        timestamp="{timestamp}",
+        datatype="{datatype}",
+        ro_input=lambda _, input: ro(input),
+    output:
+        get_pattern_tier(config, "raw", check_in_cycle=check_in_cycle),
+    log:
+        get_pattern_log(config, "tier_raw", time),
+    group:
+        "tier-raw"
+    resources:
+        mem_swap=110,
+        runtime=300,
+    shell:
+        execenv_pyexe(config, "build-tier-raw-orca") + "--log {log} "
+        f"--configs {ro(configs)} "
+        f"--chan-maps {ro(chan_maps)} "
+        "--datatype {params.datatype} "
+        "--timestamp {params.timestamp} "
+        "{params.ro_input} {output}"
+
+
 rule build_raw_fcio:
     """
     This rule runs build_raw, it takes in a file.{daq_ext} and outputs a raw file
diff --git a/workflow/src/legenddataflow/FileKey.py b/workflow/src/legenddataflow/FileKey.py
index 6857e05a..43162a24 100644
--- a/workflow/src/legenddataflow/FileKey.py
+++ b/workflow/src/legenddataflow/FileKey.py
@@ -32,7 +32,12 @@ def regex_from_filepattern(filepattern):
             f.append(f"(?P={wildcard})")
         else:
             wildcards.append(wildcard)
-            f.append(f"(?P<{wildcard}>.+)")
+            if wildcard == "ext":
+                f.append(
+                    f"(?P<{wildcard}>.*)"
+                )  # only ext may contain dots, so it captures everything after the first dot (e.g. "orca.bz2")
+            else:
+                f.append(f"(?P<{wildcard}>" + r"[^\.\/]+)")
         last = match.end()
     f.append(re.escape(filepattern[last:]))
     f.append("$")
diff --git a/workflow/src/legenddataflow/create_pars_keylist.py b/workflow/src/legenddataflow/create_pars_keylist.py
index b779b128..45e1a304 100644
--- a/workflow/src/legenddataflow/create_pars_keylist.py
+++ b/workflow/src/legenddataflow/create_pars_keylist.py
@@ -53,9 +53,14 @@ def generate_par_keylist(keys):
 
     @staticmethod
     def match_entries(entry1, entry2):
-        datatype2 = ProcessingFileKey.get_filekey_from_filename(entry2.file[0]).datatype
+        datatype2 = ProcessingFileKey.get_filekey_from_filename(
+            Path(entry2.file[0]).name
+        ).datatype
         for entry in entry1.file:
-            if ProcessingFileKey.get_filekey_from_filename(entry).datatype == datatype2:
+            if (
+                ProcessingFileKey.get_filekey_from_filename(Path(entry).name).datatype
+                == datatype2
+            ):
                 pass
             else:
                 entry2.file.append(entry)
diff --git a/workflow/src/legenddataflow/scripts/tier/raw_fcio.py b/workflow/src/legenddataflow/scripts/tier/raw_fcio.py
index 67942b93..a351d758 100644
--- a/workflow/src/legenddataflow/scripts/tier/raw_fcio.py
+++ b/workflow/src/legenddataflow/scripts/tier/raw_fcio.py
@@ -47,4 +47,10 @@ def build_tier_raw_fcio() -> None:
     if "muon_config" in channel_dict:
         raise NotImplementedError()
 
-    build_raw(args.input, out_spec=all_config, filekey=args.output, **settings)
+    build_raw(
+        args.input,
+        out_spec=all_config,
+        in_stream_type="Flashcam",
+        filekey=args.output,
+        **settings,
+    )
diff --git a/workflow/src/legenddataflow/scripts/tier/raw_orca.py b/workflow/src/legenddataflow/scripts/tier/raw_orca.py
index c4f954b4..47ba3919 100644
--- a/workflow/src/legenddataflow/scripts/tier/raw_orca.py
+++ b/workflow/src/legenddataflow/scripts/tier/raw_orca.py
@@ -100,17 +100,10 @@ def build_tier_raw_orca() -> None:
         )
         Props.add_to(all_config, aux_config)
 
-    if "muon_config" in list(channel_dict):
-        muon_config = Props.read_from(channel_dict["muon_config"])
-        muon_channels = list(
-            chmap.channelmaps.on(args.timestamp)
-            .map("system", unique=False)["muon"]
-            .map("daq.rawid")
-        )
-        top_key = next(iter(muon_config))
-        muon_config[top_key][next(iter(muon_config[top_key]))]["key_list"] = sorted(
-            muon_channels
-        )
-        Props.add_to(all_config, muon_config)
-
-    build_raw(args.input, out_spec=all_config, filekey=args.output, **settings)
+    build_raw(
+        args.input,
+        out_spec=all_config,
+        in_stream_type="ORCA",
+        filekey=args.output,
+        **settings,
+    )

From 9a6f95535b2d06a1c83a2a48ba2e7785109869c7 Mon Sep 17 00:00:00 2001
From: ggmarshall <george.marshall.20@ucl.ac.uk>
Date: Wed, 12 Mar 2025 21:53:16 +0100
Subject: [PATCH 3/3] reuse orca rule

---
 workflow/rules/raw.smk | 50 ++----------------------------------------
 1 file changed, 2 insertions(+), 48 deletions(-)

diff --git a/workflow/rules/raw.smk b/workflow/rules/raw.smk
index 86001173..e001047b 100644
--- a/workflow/rules/raw.smk
+++ b/workflow/rules/raw.smk
@@ -48,60 +48,14 @@ rule build_raw_orca:
         "{params.ro_input} {output}"
 
 
-rule build_raw_orca_bz2:
-    """
-    This rule runs build_raw, it takes in a file.{daq_ext} and outputs a raw file
-    """
+use rule build_raw_orca as build_raw_orca_bz2 with:
     input:
         get_pattern_tier_daq(config, extension="orca.bz2"),
-    params:
-        timestamp="{timestamp}",
-        datatype="{datatype}",
-        ro_input=lambda _, input: ro(input),
-    output:
-        get_pattern_tier(config, "raw", check_in_cycle=check_in_cycle),
-    log:
-        get_pattern_log(config, "tier_raw", time),
-    group:
-        "tier-raw"
-    resources:
-        mem_swap=110,
-        runtime=300,
-    shell:
-        execenv_pyexe(config, "build-tier-raw-orca") + "--log {log} "
-        f"--configs {ro(configs)} "
-        f"--chan-maps {ro(chan_maps)} "
-        "--datatype {params.datatype} "
-        "--timestamp {params.timestamp} "
-        "{params.ro_input} {output}"
 
 
-rule build_raw_orca_gzip:
-    """
-    This rule runs build_raw, it takes in a file.{daq_ext} and outputs a raw file
-    """
+use rule build_raw_orca as build_raw_orca_gz with:
     input:
         get_pattern_tier_daq(config, extension="orca.gz"),
-    params:
-        timestamp="{timestamp}",
-        datatype="{datatype}",
-        ro_input=lambda _, input: ro(input),
-    output:
-        get_pattern_tier(config, "raw", check_in_cycle=check_in_cycle),
-    log:
-        get_pattern_log(config, "tier_raw", time),
-    group:
-        "tier-raw"
-    resources:
-        mem_swap=110,
-        runtime=300,
-    shell:
-        execenv_pyexe(config, "build-tier-raw-orca") + "--log {log} "
-        f"--configs {ro(configs)} "
-        f"--chan-maps {ro(chan_maps)} "
-        "--datatype {params.datatype} "
-        "--timestamp {params.timestamp} "
-        "{params.ro_input} {output}"
 
 
 rule build_raw_fcio: