Merge branch 'main' into stable

brsynth · Jan 21, 2025 · 16ecc06 · 16ecc06
2 parents 4a27ae2 + c06b3cc
commit 16ecc06
Show file tree

Hide file tree

Showing 10 changed files with 1,272 additions and 65 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include icfree/learner/README.md
diff --git a/README.md b/README.md
@@ -13,12 +13,15 @@ iCFree is a Python-based program designed to automate the process of generating
       - [Sampler](#sampler)
         - [Usage](#usage-1)
         - [Arguments](#arguments)
-      - [Plate Generator](#plate-generator)
+      - [Plate Designer](#plate-designer)
         - [Usage](#usage-2)
         - [Options](#options)
       - [Instructor](#instructor)
         - [Usage](#usage-3)
         - [Options](#options-1)
+      - [Learner](#learner)
+        - [Usage](#usage-4)
+        - [Options](#options-2)
     - [Example](#example)
   - [License](#license)
   - [Authors](#authors)
@@ -45,7 +48,7 @@ The main entry point of the program is the `__main__.py` file. You can run the p
 ### Basic Command
 
 ```bash
-python -m icfree --sampler_input_filename <input_file> --sampler_nb_samples <number_of_samples> --sampler_seed <seed> --sampler_output_filename <output_file> --plate_generator_input_filename <input_file> --plate_generator_sample_volume <volume> --plate_generator_default_dead_volume <dead_volume> --plate_generator_num_replicates <replicates> --plate_generator_well_capacity <capacity> --plate_generator_start_well_src_plt <start_well_src> --plate_generator_start_well_dst_plt <start_well_dst> --plate_generat...
+python -m icfree --sampler_input_filename <input_file> --sampler_nb_samples <number_of_samples> --sampler_seed <seed> --sampler_output_filename <output_file> --plate_designer_input_filename <input_file> --plate_designer_sample_volume <volume> --plate_designer_default_dead_volume <dead_volume> --plate_designer_num_replicates <replicates> --plate_designer_well_capacity <capacity> --plate_designer_start_well_src_plt <start_well_src> --plate_designer_start_well_dst_plt <start_well_dst> --plate_generat...
 ```
 
 ### Components
@@ -67,13 +70,13 @@ python icfree/sampler.py <input_file> <output_file> <num_samples> [--step <step_
 - --step: Step size for creating discrete ranges (default: 2.5).
 - --seed: Seed for random number generation for reproducibility (optional).
 
-#### Plate Generator
-The plate_generator.py script generates plates based on the sampled data.
+#### Plate Designer
+The plate_designer.py script generates plates based on the sampled data.
 
 ##### Usage
 
 ```bash
-python icfree/plate_generator.py <input_file> <sample_volume> [options]
+python icfree/plate_designer.py <input_file> <sample_volume> [options]
 ```
 
 ##### Options
@@ -84,6 +87,7 @@ python icfree/plate_generator.py <input_file> <sample_volume> [options]
 - --well_capacity: Well capacity.
 - --start_well_src_plt: Starting well for the source plate.
 - --start_well_dst_plt: Starting well for the destination plate.
+- --extra_wells: Extra wells to add to the plate.
 - --output_folder: Folder to save the output files.
 
 #### Instructor
@@ -101,13 +105,40 @@ python icfree/instructor.py <source_plate> <destination_plate> <output_instructi
 - --split_threshold: Threshold for splitting components.
 - --source_plate_type: Type of the source plate.
 - --split_components: Components to split.
+- --dispense_order: Comma-separated list of component names specifying the dispensing order.
+
+#### Learner
+The Learner module carries out an active learning process to both train the model and explore the space of possible cell-free combinations.
+
+##### Usage
+
+```bash
+python -m icfree.learner <data_folder> <parameter_file> <output_folder> [options]
+```
+
+##### Options
+
+  - --name_list: a comma-separated string of column names or identifiers, converted to a list of strings representing columns that contain labels (y). This separates y columns from the rest (X features). (Default: Yield1,Yield2,Yield3,Yield4,Yield5)
+  - --test: a flag for validating the model; not required to run inside the active learning loop. If not set, the validation step is skipped.
+  - --nb_rep NB_REP: the number of test repetitions for validating the model behavior. 80% of data is randomly separated for training, and 20% is used for testing. (Default: 100)
+  - --flatten: a flag to indicate whether to flatten Y data. If set, each repetition of the same experiment is treated independently, so identical X values with different y outputs are all modeled. Otherwise, y is averaged across repetitions and only the average is modeled.
+  - --seed SEED: the random seed value used for reproducibility in random operations. (Default: 85)
+  - --nb_new_data_predict: The number of new data points sampled from all possible cases. (Default: 1000)
+  - --nb_new_data: The number of new data points selected from the generated ones. These are the data points labeled after active learning loops. `nb_new_data_predict` must be greater than `nb_new_data` to be meaningful. (Default: 50)
+  - --parameter_step: The step size used to decrement the maximum predefined concentration sequentially. For example, if the maximum concentration is `max`, the sequence of concentrations is calculated as: `max - 1 * parameter_step`, `max - 2 * parameter_step`, `max - 3 * parameter_step`, and so on. Each concentration is a candidate for experimental testing. Smaller steps result in more possible combinations to sample. (Default: 10)
+  - --n_group: parameter for the cluster margin algorithm, specifying the number of groups into which generated data will be clustered. (Default: 15)
+  - --km: parameter for the cluster margin algorithm, specifying the number of data points for the first selection. Ensure `nb_new_data_predict > km > ks`. (Default: 50)
+  - --ks: parameter for the cluster margin algorithm, specifying the number of data points for the second selection. This is also similar to `nb_new_data`. (Default: 20)
+  - --plot: a flag to indicate whether to generate all plots for analysis visualization.
+  - --save_plot: a flag to indicate whether to save all generated plots.
+  - --verbose: flag to indicate whether to print all messages to the console.
 
 ### Example
 
 Here is an example of how to run the program with sample data:
 
 ```bash
-python -m icfree --sampler_input_filename data/components.csv --sampler_nb_samples 100 --sampler_seed 42 --sampler_output_filename results/samples.csv --plate_generator_input_filename results/samples.csv --plate_generator_sample_volume 10 --plate_generator_default_dead_volume 2 --plate_generator_num_replicates 3 --plate_generator_well_capacity 200 --plate_generator_start_well_src_plt A1 --plate_generator_start_well_dst_plt B1 --plate_generator_output_folder results/plates --instructor_max_transfer_volume...
+python -m icfree --sampler_input_filename data/components.csv --sampler_nb_samples 100 --sampler_seed 42 --sampler_output_filename results/samples.csv --plate_designer_input_filename results/samples.csv --plate_designer_sample_volume 10 --plate_designer_default_dead_volume 2 --plate_designer_num_replicates 3 --plate_designer_well_capacity 200 --plate_designer_start_well_src_plt A1 --plate_designer_start_well_dst_plt B1 --plate_designer_output_folder results/plates --instructor_max_transfer_volume...
 ```
 
 ## License

diff --git a/icfree/instructor.py b/icfree/instructor.py
@@ -92,8 +92,27 @@ def split_volumes(source_well, dest_well, volume, component, plate_type):
     instructions_df = pd.DataFrame(instructions)
     return instructions_df.groupby('Sample ID', as_index=False).apply(lambda x: x)
 
+def reorder_by_dispense_order(df, dispense_order):
+    """
+    Return the rows of a DataFrame sorted by the specified dispensing order.
+
+    Rows are ranked by the position of their 'Sample ID' value in
+    ``dispense_order``; values that are not listed are placed after all
+    listed ones. Rows with the same rank are ordered by 'Sample ID'.
+    The input DataFrame is left unmodified.
+
+    Args:
+        df (DataFrame): The DataFrame to reorder; must have a 'Sample ID' column.
+        dispense_order (list): List of component names specifying the desired order.
+
+    Returns:
+        DataFrame: Reordered copy of ``df``, or ``df`` itself when
+        ``dispense_order`` is empty or None.
+    """
+    if not dispense_order:
+        return df
+
+    # Create a mapping of components to their order
+    order_mapping = {component: i for i, component in enumerate(dispense_order)}
+    default_order = len(dispense_order)  # Unspecified components go to the end
+    ranks = df['Sample ID'].map(lambda x: order_mapping.get(x, default_order))
+
+    # assign() works on a copy, so the temporary sort key never leaks into
+    # the caller's DataFrame.
+    return (
+        df.assign(**{'Dispense Order': ranks})
+        .sort_values(by=['Dispense Order', 'Sample ID'])
+        .drop(columns=['Dispense Order'])
+    )
+
 def main(source_plate_file, destination_plate_file, output_file, source_plate_type="default:384PP_AQ_GP3",
-         max_transfer_volume=None, split_threshold=None, split_components=None):
+         max_transfer_volume=None, split_threshold=None, split_components=None, dispense_order=None):
     """
     Main function to read input files, generate ECHO instructions, and write the output to files.
     
@@ -105,36 +124,33 @@ def main(source_plate_file, destination_plate_file, output_file, source_plate_ty
     - max_transfer_volume: Maximum volume for a single transfer. If not specified, no splitting will be performed.
     - split_threshold: Volume threshold above which transfers need to be split. If not specified, no splitting will be performed.
     - split_components: Comma-separated list of component names to create separate files for.
+    - dispense_order: Comma-separated list of component names specifying dispensing order.
     """
-    # Parse the source plate types from the string argument
     source_plate_types = parse_plate_types(source_plate_type)
-
-    # Read the source and destination plate data from CSV files
     source_plate_df = pd.read_csv(source_plate_file)
     destination_plate_df = pd.read_csv(destination_plate_file)
-
-    # Generate the ECHO instructions
     instructions_df = generate_echo_instructions(source_plate_df, destination_plate_df, source_plate_types,
                                                  max_transfer_volume, split_threshold)
 
-    # Handle splitting of components into separate files if specified
+    if dispense_order:
+        dispense_order_list = dispense_order.split(',')
+        instructions_df = reorder_by_dispense_order(instructions_df, dispense_order_list)
+
     if split_components:
         split_components_list = split_components.split(',')
         for component in split_components_list:
             component_df = instructions_df[instructions_df['Sample ID'] == component]
             component_output_file = f"{os.path.splitext(output_file)[0]}_{component}.csv"
             component_df.to_csv(component_output_file, index=False)
-            print(f"Instructions for {component} have been generated and saved to {component_output_file}")
+            print(f"Instructions for {component} saved to {component_output_file}")
 
-        # Write remaining components to the original output file
         remaining_df = instructions_df[~instructions_df['Sample ID'].isin(split_components_list)]
         if not remaining_df.empty:
             remaining_df.to_csv(output_file, index=False)
-            print(f"Instructions for remaining components have been generated and saved to {output_file}")
+            print(f"Remaining instructions saved to {output_file}")
     else:
-        # Write all instructions to the original output file
         instructions_df.to_csv(output_file, index=False)
-        print(f"Instructions have been generated and saved to {output_file}")
+        print(f"Instructions saved to {output_file}")
 
 if __name__ == "__main__":
     import sys
@@ -144,11 +160,11 @@ def main(source_plate_file, destination_plate_file, output_file, source_plate_ty
     parser.add_argument("output_file", type=str, help="Path to the output instructions file.")
     parser.add_argument("--source_plate_type", type=str, default="default:384PP_AQ_GP3",
                         help="Comma-separated list of component and plate type pairs, e.g., 'Component_1:384PP_AQ_CP,Component_2:384PP_AQ_GP3'. Default for all is 384PP_AQ_GP3.")
-    parser.add_argument("--max_transfer_volume", type=int, help="Maximum volume for a single transfer. If not specified, no splitting will be performed.")
-    parser.add_argument("--split_threshold", type=int, help="Volume threshold above which transfers need to be split. If not specified, no splitting will be performed.")
-    parser.add_argument("--split_components", type=str, help="Comma-separated list of component names to create separate files for.")
-
+    parser.add_argument("--max_transfer_volume", type=int, help="Maximum volume for a single transfer. No splitting if not specified.")
+    parser.add_argument("--split_threshold", type=int, help="Volume threshold for splitting transfers. No splitting if not specified.")
+    parser.add_argument("--split_components", type=str, help="Comma-separated list of components for separate output files.")
+    parser.add_argument("--dispense_order", type=str, help="Comma-separated list of components specifying dispensing order.")
     args = parser.parse_args()
 
     main(args.source_plate_file, args.destination_plate_file, args.output_file, args.source_plate_type,
-         args.max_transfer_volume, args.split_threshold, args.split_components)
+         args.max_transfer_volume, args.split_threshold, args.split_components, args.dispense_order)