diff --git a/pm4py/analysis.py b/pm4py/analysis.py index a0584145c..52e29137e 100644 --- a/pm4py/analysis.py +++ b/pm4py/analysis.py @@ -34,18 +34,26 @@ import deprecation -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="this method will be removed in a future release.") -def construct_synchronous_product_net(trace: Trace, petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking) -> Tuple[PetriNet, Marking, Marking]: - """ - constructs the synchronous product net between a trace and a Petri net process model. - - :param trace: trace of an event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - - :rtype: ``Tuple[PetriNet, Marking, Marking]`` +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="this method will be removed in a future release." +) +def construct_synchronous_product_net( + trace: Trace, + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking +) -> Tuple[PetriNet, Marking, Marking]: + """ + Constructs the synchronous product net between a trace and a Petri net process model. + + :param trace: A trace from an event log. + :param petri_net: The Petri net process model. + :param initial_marking: The initial marking of the Petri net. + :param final_marking: The final marking of the Petri net. + :return: A tuple containing the synchronous Petri net, the initial marking, and the final marking. + :rtype: Tuple[PetriNet, Marking, Marking] .. code-block:: python3 @@ -59,19 +67,29 @@ def construct_synchronous_product_net(trace: Trace, petri_net: PetriNet, initial from pm4py.objects.petri_net.utils.synchronous_product import construct from pm4py.objects.petri_net.utils.align_utils import SKIP trace_net, trace_im, trace_fm = construct_trace_net(trace) - sync_net, sync_im, sync_fm = construct(trace_net, trace_im, trace_fm, petri_net, initial_marking, final_marking, - SKIP) + sync_net, sync_im, sync_fm = construct( + trace_net, + trace_im, + trace_fm, + petri_net, + initial_marking, + final_marking, + SKIP + ) return sync_net, sync_im, sync_fm -def compute_emd(language1: Dict[List[str], float], language2: Dict[List[str], float]) -> float: +def compute_emd( + language1: Dict[List[str], float], + language2: Dict[List[str], float] +) -> float: """ - Computes the earth mover distance between two stochastic languages (for example, the first extracted from the log, - and the second extracted from the process model. + Computes the Earth Mover Distance (EMD) between two stochastic languages. For example, one language may be extracted from a log, and the other from a process model. - :param language1: (first) stochastic language - :param language2: (second) stochastic language - :rtype: ``float`` + :param language1: The first stochastic language. + :param language2: The second stochastic language. + :return: The computed Earth Mover Distance. + :rtype: float .. code-block:: python3 @@ -90,18 +108,21 @@ def compute_emd(language1: Dict[List[str], float], language2: Dict[List[str], fl return earth_mover_distance.apply(language1, language2) -def solve_marking_equation(petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking, cost_function: Dict[PetriNet.Transition, float] = None) -> float: +def solve_marking_equation( + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + cost_function: Dict[PetriNet.Transition, float] = None +) -> float: """ - Solves the marking equation of a Petri net. 
- The marking equation is solved as an ILP problem. - An optional transition-based cost function to minimize can be provided as well. + Solves the marking equation of a Petri net using an Integer Linear Programming (ILP) approach. An optional transition-based cost function can be provided to minimize the solution. - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param cost_function: optional cost function to use when solving the marking equation - :rtype: ``float`` + :param petri_net: The Petri net. + :param initial_marking: The initial marking of the Petri net. + :param final_marking: The final marking of the Petri net. + :param cost_function: (Optional) A dictionary mapping transitions to their associated costs. If not provided, a default cost of 1 is assigned to each transition. + :return: The heuristic value obtained by solving the marking equation. + :rtype: float .. code-block:: python3 @@ -113,31 +134,42 @@ def solve_marking_equation(petri_net: PetriNet, initial_marking: Marking, from pm4py.algo.analysis.marking_equation import algorithm as marking_equation if cost_function is None: - cost_function = dict() - for t in petri_net.transitions: - cost_function[t] = 1 + cost_function = {t: 1 for t in petri_net.transitions} me = marking_equation.build( - petri_net, initial_marking, final_marking, parameters={'costs': cost_function}) + petri_net, + initial_marking, + final_marking, + parameters={'costs': cost_function} + ) return marking_equation.get_h_value(me) -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="this method will be removed in a future release.") -def solve_extended_marking_equation(trace: Trace, sync_net: PetriNet, sync_im: Marking, - sync_fm: Marking, split_points: Optional[List[int]] = None) -> float: - """ - Gets an heuristics value (underestimation of the cost of an alignment) between a trace - and a synchronous product net using the extended marking equation with the standard cost function - (e.g. sync moves get cost equal to 0, invisible moves get cost equal to 1, - other move on model / move on log get cost equal to 10000), with an optimal provisioning of the split - points - - :param trace: trace - :param sync_net: synchronous product net - :param sync_im: initial marking (of the sync net) - :param sync_fm: final marking (of the sync net) - :param split_points: if specified, the indexes of the events of the trace to be used as split points. If not specified, the split points are identified automatically. - :rtype: ``float`` +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="this method will be removed in a future release." +) +def solve_extended_marking_equation( + trace: Trace, + sync_net: PetriNet, + sync_im: Marking, + sync_fm: Marking, + split_points: Optional[List[int]] = None +) -> float: + """ + Computes a heuristic value (an underestimation of the cost of an alignment) between a trace + and a synchronous product net using the extended marking equation with the standard cost function. + For example, synchronization moves have a cost of 0, invisible moves have a cost of 1, + and other moves on the model or log have a cost of 10,000. This method provides optimal provisioning of the split points. + + :param trace: The trace to evaluate. + :param sync_net: The synchronous product net. + :param sync_im: The initial marking of the synchronous net. + :param sync_fm: The final marking of the synchronous net. 
+ :param split_points: (Optional) The indices of the events in the trace to be used as split points. If not specified, the split points are identified automatically. + :return: The heuristic value representing the cost underestimation. + :rtype: float .. code-block:: python3 @@ -150,34 +182,46 @@ def solve_extended_marking_equation(trace: Trace, sync_net: PetriNet, sync_im: M from pm4py.algo.analysis.extended_marking_equation import algorithm as extended_marking_equation parameters = {} if split_points is not None: - parameters[extended_marking_equation.Variants.CLASSIC.value.Parameters.SPLIT_IDX] = split_points + parameters[ + extended_marking_equation.Variants.CLASSIC.value.Parameters.SPLIT_IDX + ] = split_points me = extended_marking_equation.build( - trace, sync_net, sync_im, sync_fm, parameters=parameters) + trace, sync_net, sync_im, sync_fm, parameters=parameters + ) return extended_marking_equation.get_h_value(me) -def check_soundness(petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking, print_diagnostics: bool = False) -> Tuple[bool, Dict[str, Any]]: - """ - Check if a given Petri net is a sound WF-net. - A Petri net is a WF-net iff: - - it has a unique source place - - it has a unique end place - - every element in the WF-net is on a path from the source to the sink place - A WF-net is sound iff: - - it contains no live-locks - - it contains no deadlocks - - we are able to always reach the final marking - For a formal definition of sound WF-net, consider: http://www.padsweb.rwth-aachen.de/wvdaalst/publications/p628.pdf - In the returned object, the first element is a boolean indicating if the Petri net is a sound workflow net. - The second element is a set of diagnostics collected while running WOFLAN - (expressed as a dictionary associating the keys [name of the diagnostics] with the corresponding diagnostics). - - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param print_diagnostics: boolean value that sets up additional prints during the execution of WOFLAN - :rtype: ``Tuple[bool, Dict[str, Any]]`` +def check_soundness( + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + print_diagnostics: bool = False +) -> Tuple[bool, Dict[str, Any]]: + """ + Checks if a given Petri net is a sound Workflow net (WF-net). + + A Petri net is a WF-net if and only if: + - It has a unique source place. + - It has a unique end place. + - Every element in the WF-net is on a path from the source to the sink place. + + A WF-net is sound if and only if: + - It contains no live-locks. + - It contains no deadlocks. + - It is always possible to reach the final marking from any reachable marking. + + For a formal definition of a sound WF-net, refer to: http://www.padsweb.rwth-aachen.de/wvdaalst/publications/p628.pdf + + The returned tuple consists of: + - A boolean indicating whether the Petri net is a sound WF-net. + - A dictionary containing diagnostics collected while running WOFLAN, associating diagnostic names with their corresponding details. + + :param petri_net: The Petri net to check. + :param initial_marking: The initial marking of the Petri net. + :param final_marking: The final marking of the Petri net. + :param print_diagnostics: If True, additional diagnostics will be printed during the execution of WOFLAN. + :return: A tuple containing a boolean indicating soundness and a dictionary of diagnostics. + :rtype: Tuple[bool, Dict[str, Any]] .. 
code-block:: python3 @@ -187,22 +231,35 @@ def check_soundness(petri_net: PetriNet, initial_marking: Marking, is_sound = pm4py.check_soundness(net, im, fm) """ from pm4py.algo.analysis.woflan import algorithm as woflan - return woflan.apply(petri_net, initial_marking, final_marking, - parameters={"return_asap_when_not_sound": True, "return_diagnostics": True, "print_diagnostics": print_diagnostics}) - - -def cluster_log(log: Union[EventLog, EventStream, pd.DataFrame], sklearn_clusterer=None, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Generator[EventLog, None, None]: - """ - Apply clustering to the provided event log - (method based on the extraction of profiles for the traces of the event log) - based on a Scikit-Learn clusterer (default: K-means with two clusters) - - :param log: log object - :param sklearn_clusterer: the Scikit-Learn clusterer to be used (default: KMeans(n_clusters=2, random_state=0, n_init="auto")) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Generator[pd.DataFrame, None, None]`` + return woflan.apply( + petri_net, + initial_marking, + final_marking, + parameters={ + "return_asap_when_not_sound": True, + "return_diagnostics": True, + "print_diagnostics": print_diagnostics + } + ) + + +def cluster_log( + log: Union[EventLog, EventStream, pd.DataFrame], + sklearn_clusterer=None, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Generator[EventLog, None, None]: + """ + Applies clustering to the provided event log by extracting profiles for the log's traces and clustering them using a Scikit-Learn clusterer (default is K-Means with two clusters). + + :param log: The event log to cluster. + :param sklearn_clusterer: (Optional) The Scikit-Learn clusterer to use. Default is KMeans with `n_clusters=2`, `random_state=0`, and `n_init="auto"`. + :param activity_key: The key used to identify activities in the log. + :param timestamp_key: The key used to identify timestamps in the log. + :param case_id_key: The key used to identify case IDs in the log. + :return: A generator that yields clustered event logs as pandas DataFrames. + :rtype: Generator[pd.DataFrame, None, None] .. 
code-block:: python3 @@ -213,7 +270,12 @@ def cluster_log(log: Union[EventLog, EventStream, pd.DataFrame], sklearn_cluster """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) if sklearn_clusterer is not None: properties["sklearn_clusterer"] = sklearn_clusterer @@ -221,32 +283,52 @@ def cluster_log(log: Union[EventLog, EventStream, pd.DataFrame], sklearn_cluster return clusterer.apply(log, parameters=properties) -def insert_artificial_start_end(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", artificial_start=constants.DEFAULT_ARTIFICIAL_START_ACTIVITY, artificial_end=constants.DEFAULT_ARTIFICIAL_END_ACTIVITY) -> Union[EventLog, pd.DataFrame]: +def insert_artificial_start_end( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + artificial_start=constants.DEFAULT_ARTIFICIAL_START_ACTIVITY, + artificial_end=constants.DEFAULT_ARTIFICIAL_END_ACTIVITY +) -> Union[EventLog, pd.DataFrame]: """ - Inserts the artificial start/end activities in an event log / Pandas dataframe + Inserts artificial start and end activities into an event log or a Pandas DataFrame. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param artificial_start: the symbol to be used as artificial start activity - :param artificial_end: the symbol to be used as artificial end activity - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: The event log or Pandas DataFrame to modify. + :param activity_key: The attribute key used for activities. + :param timestamp_key: The attribute key used for timestamps. + :param case_id_key: The attribute key used to identify cases. + :param artificial_start: The symbol to use for the artificial start activity. + :param artificial_end: The symbol to use for the artificial end activity. + :return: The event log or Pandas DataFrame with artificial start and end activities inserted. + :rtype: Union[EventLog, pd.DataFrame] .. 
code-block:: python3 import pm4py - dataframe = pm4py.insert_artificial_start_end(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + dataframe = pm4py.insert_artificial_start_end( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) properties[constants.PARAM_ARTIFICIAL_START_ACTIVITY] = artificial_start properties[constants.PARAM_ARTIFICIAL_END_ACTIVITY] = artificial_end if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + check_pandas_dataframe_columns( + log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key + ) from pm4py.objects.log.util import dataframe_utils return dataframe_utils.insert_artificial_start_end(log, parameters=properties) else: @@ -254,103 +336,167 @@ def insert_artificial_start_end(log: Union[EventLog, pd.DataFrame], activity_key return artificial.insert_artificial_start_end(log, parameters=properties) -def insert_case_service_waiting_time(log: Union[EventLog, pd.DataFrame], service_time_column: str = "@@service_time", - sojourn_time_column: str = "@@sojourn_time", - waiting_time_column: str = "@@waiting_time", activity_key: str = "concept:name", - timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", - start_timestamp_key: str = "time:timestamp") -> pd.DataFrame: - """ - Inserts the service/waiting/sojourn times of the case in the dataframe. - - :param log: event log / Pandas dataframe - :param service_time_column: column to be used for the service time - :param sojourn_time_column: column to be used for the sojourn time - :param waiting_time_column: column to be used for the waiting time - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param start_timestamp_key: attribute to be used as start timestamp - :rtype: ``pd.DataFrame`` +def insert_case_service_waiting_time( + log: Union[EventLog, pd.DataFrame], + service_time_column: str = "@@service_time", + sojourn_time_column: str = "@@sojourn_time", + waiting_time_column: str = "@@waiting_time", + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + start_timestamp_key: str = "time:timestamp" +) -> pd.DataFrame: + """ + Inserts service time, waiting time, and sojourn time information for each case into a Pandas DataFrame. + + :param log: The event log or Pandas DataFrame to modify. + :param service_time_column: The name of the column to store service times. + :param sojourn_time_column: The name of the column to store sojourn times. + :param waiting_time_column: The name of the column to store waiting times. + :param activity_key: The attribute key used for activities. + :param timestamp_key: The attribute key used for timestamps. + :param case_id_key: The attribute key used to identify cases. + :param start_timestamp_key: The attribute key used for the start timestamp of cases. 
+ :return: A Pandas DataFrame with the inserted service, waiting, and sojourn time columns. + :rtype: pd.DataFrame .. code-block:: python3 import pm4py - dataframe = pm4py.insert_case_service_waiting_time(dataframe, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', start_timestamp_key='time:timestamp') + dataframe = pm4py.insert_case_service_waiting_time( + dataframe, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name', + start_timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) from pm4py.objects.conversion.log import converter as log_converter - log = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=properties) - - return pandas_utils.insert_case_service_waiting_time(log, case_id_column=case_id_key, timestamp_column=timestamp_key, start_timestamp_column=start_timestamp_key, service_time_column=service_time_column, waiting_time_column=waiting_time_column, sojourn_time_column=sojourn_time_column) - - -def insert_case_arrival_finish_rate(log: Union[EventLog, pd.DataFrame], arrival_rate_column="@@arrival_rate", finish_rate_column="@@finish_rate", - activity_key: str = "concept:name", - timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", - start_timestamp_key: str = "time:timestamp") -> pd.DataFrame: - """ - Inserts the arrival/finish rates of the case in the dataframe. - The arrival rate is computed as the difference between the start time of the case and the start time of the previous case to start. - The finish rate is computed as the difference between the end time of the case and the end time of the next case to end. - - :param log: event log / Pandas dataframe - :param arrival_rate_column: column to be used for the arrival rate - :param finish_rate_column: column to be used for the finish rate - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param start_timestamp_key: attribute to be used as start timestamp - :rtype: ``pd.DataFrame`` + log_df = log_converter.apply( + log, + variant=log_converter.Variants.TO_DATA_FRAME, + parameters=properties + ) + + return pandas_utils.insert_case_service_waiting_time( + log_df, + case_id_column=case_id_key, + timestamp_column=timestamp_key, + start_timestamp_column=start_timestamp_key, + service_time_column=service_time_column, + waiting_time_column=waiting_time_column, + sojourn_time_column=sojourn_time_column + ) + + +def insert_case_arrival_finish_rate( + log: Union[EventLog, pd.DataFrame], + arrival_rate_column: str = "@@arrival_rate", + finish_rate_column: str = "@@finish_rate", + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + start_timestamp_key: str = "time:timestamp" +) -> pd.DataFrame: + """ + Inserts arrival and finish rate information for each case into a Pandas DataFrame. + + The arrival rate is computed as the time difference between the start of the current case and the start of the previous case to start. 
+ The finish rate is computed as the time difference between the end of the current case and the end of the next case to finish. + + :param log: The event log or Pandas DataFrame to modify. + :param arrival_rate_column: The name of the column to store arrival rates. + :param finish_rate_column: The name of the column to store finish rates. + :param activity_key: The attribute key used for activities. + :param timestamp_key: The attribute key used for timestamps. + :param case_id_key: The attribute key used to identify cases. + :param start_timestamp_key: The attribute key used for the start timestamp of cases. + :return: A Pandas DataFrame with the inserted arrival and finish rate columns. + :rtype: pd.DataFrame .. code-block:: python3 import pm4py - dataframe = pm4py.insert_case_arrival_finish_rate(dataframe, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', start_timestamp_key='time:timestamp') + dataframe = pm4py.insert_case_arrival_finish_rate( + dataframe, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name', + start_timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) from pm4py.objects.conversion.log import converter as log_converter - log = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=properties) - - return pandas_utils.insert_case_arrival_finish_rate(log, case_id_column=case_id_key, timestamp_column=timestamp_key, start_timestamp_column=start_timestamp_key, arrival_rate_column=arrival_rate_column, finish_rate_column=finish_rate_column) + log_df = log_converter.apply( + log, + variant=log_converter.Variants.TO_DATA_FRAME, + parameters=properties + ) + + return pandas_utils.insert_case_arrival_finish_rate( + log_df, + case_id_column=case_id_key, + timestamp_column=timestamp_key, + start_timestamp_column=start_timestamp_key, + arrival_rate_column=arrival_rate_column, + finish_rate_column=finish_rate_column + ) def check_is_workflow_net(net: PetriNet) -> bool: """ - Checks if the input Petri net satisfies the WF-net conditions: - 1. unique source place - 2. unique sink place - 3. every node is on a path from the source to the sink + Checks if the input Petri net satisfies the WF-net (Workflow net) conditions: + 1. It has a unique source place. + 2. It has a unique sink place. + 3. Every node is on a path from the source to the sink. - :param net: petri net - :rtype: ``bool`` + :param net: The Petri net to check. + :return: True if the Petri net is a WF-net, False otherwise. + :rtype: bool .. code-block:: python3 import pm4py - net, im, fm = pm4py.read_pnml('model.pnml') - is_wfnet = pm4py.check_is_workflow_net(net, im, fm) + net = pm4py.read_pnml('model.pnml') + is_wfnet = pm4py.check_is_workflow_net(net) """ from pm4py.algo.analysis.workflow_net import algorithm return algorithm.apply(net) -def maximal_decomposition(net: PetriNet, im: Marking, fm: Marking) -> List[Tuple[PetriNet, Marking, Marking]]: +def maximal_decomposition( + net: PetriNet, + im: Marking, + fm: Marking +) -> List[Tuple[PetriNet, Marking, Marking]]: """ - Calculate the maximal decomposition of an accepting Petri net. + Calculates the maximal decomposition of an accepting Petri net into its maximal components. 
- :param net: petri net - :param im: initial marking - :param fm: final marking - :rtype: ``List[Tuple[PetriNet, Marking, Marking]]`` + :param net: The Petri net to decompose. + :param im: The initial marking of the Petri net. + :param fm: The final marking of the Petri net. + :return: A list of tuples, each containing a subnet Petri net, its initial marking, and its final marking. + :rtype: List[Tuple[PetriNet, Marking, Marking]] .. code-block:: python3 @@ -358,35 +504,44 @@ def maximal_decomposition(net: PetriNet, im: Marking, fm: Marking) -> List[Tuple net, im, fm = pm4py.read_pnml('model.pnml') list_nets = pm4py.maximal_decomposition(net, im, fm) - for anet in list_nets: - subnet, subim, subfm = anet + for subnet, subim, subfm in list_nets: pm4py.view_petri_net(subnet, subim, subfm, format='svg') """ from pm4py.objects.petri_net.utils.decomposition import decompose return decompose(net, im, fm) -def simplicity_petri_net(net: PetriNet, im: Marking, fm: Marking, variant: Optional[str] = "arc_degree") -> float: +def simplicity_petri_net( + net: PetriNet, + im: Marking, + fm: Marking, + variant: Optional[str] = "arc_degree" +) -> float: """ Computes the simplicity metric for a given Petri net model. - The three available approaches are: - - Arc degree simplicity: described in the paper Vázquez-Barreiros, Borja, Manuel Mucientes, and Manuel Lama. "ProDiGen: Mining complete, precise and minimal structure process models with a genetic algorithm." Information Sciences 294 (2015): 315-333. - - Extended cardoso metric: described in the paper "Complexity Metrics for Workflow Nets" Lassen, Kristian Bisgaard, and Wil MP van der Aalst - - Extended cyclomatic metric: described in the paper "Complexity Metrics for Workflow Nets" Lassen, Kristian Bisgaard, and Wil MP van der Aalst - + Three available approaches are supported: + - **Arc Degree Simplicity**: Described in the paper "ProDiGen: Mining complete, precise and minimal structure process models with a genetic algorithm." by Vázquez-Barreiros, Borja, Manuel Mucientes, and Manuel Lama. Information Sciences, 294 (2015): 315-333. + - **Extended Cardoso Metric**: Described in the paper "Complexity Metrics for Workflow Nets" by Lassen, Kristian Bisgaard, and Wil MP van der Aalst. + - **Extended Cyclomatic Metric**: Also described in the paper "Complexity Metrics for Workflow Nets" by Lassen, Kristian Bisgaard, and Wil MP van der Aalst. - :param net: petri net - :param im: initial marking - :param fm: final marking - :param variant: variant to be used ('arc_degree', 'extended_cardoso', 'extended_cyclomatic') - :rtype: ``float`` + :param net: The Petri net for which to compute simplicity. + :param im: The initial marking of the Petri net. + :param fm: The final marking of the Petri net. + :param variant: The simplicity metric variant to use ('arc_degree', 'extended_cardoso', 'extended_cyclomatic'). + :return: The computed simplicity value. + :rtype: float .. 
code-block:: python3 import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) simplicity = pm4py.simplicity_petri_net(net, im, fm, variant='arc_degree') """ if variant == "arc_degree": @@ -400,13 +555,21 @@ def simplicity_petri_net(net: PetriNet, im: Marking, fm: Marking, variant: Optio return extended_cyclomatic.apply(net, im) -def generate_marking(net: PetriNet, place_or_dct_places: Union[str, PetriNet.Place, Dict[str, int], Dict[PetriNet.Place, int]]) -> Marking: +def generate_marking( + net: PetriNet, + place_or_dct_places: Union[str, PetriNet.Place, Dict[str, int], Dict[PetriNet.Place, int]] +) -> Marking: """ - Generate a marking for a given Petri net + Generates a marking for a given Petri net based on specified places and token counts. - :param net: petri net - :param place_or_dct_places: place, or dictionary of places, to be used in the marking. Possible values: single Place object for the marking; name of the place for the marking; dictionary associating to each place its number of tokens; dictionary associating to names of places a number of tokens. - :rtype: ``Marking`` + :param net: The Petri net for which to generate the marking. + :param place_or_dct_places: Specifies the places and their token counts for the marking. It can be: + - A single `PetriNet.Place` object, which will have one token. + - A string representing the name of a place, which will have one token. + - A dictionary mapping `PetriNet.Place` objects to their respective number of tokens. + - A dictionary mapping place names (strings) to their respective number of tokens. + :return: The generated Marking object. + :rtype: Marking .. code-block:: python3 @@ -417,28 +580,29 @@ def generate_marking(net: PetriNet, place_or_dct_places: Union[str, PetriNet.Pla """ dct_places = {x.name: x for x in net.places} if isinstance(place_or_dct_places, PetriNet.Place): - # we specified a single Place object for the marking + # A single Place object is specified for the marking return Marking({place_or_dct_places: 1}) elif isinstance(place_or_dct_places, str): - # we specified the name of a place for the marking + # The name of a place is specified for the marking return Marking({dct_places[place_or_dct_places]: 1}) elif isinstance(place_or_dct_places, dict): dct_keys = list(place_or_dct_places) if dct_keys: if isinstance(dct_keys[0], PetriNet.Place): - # we specified a dictionary associating to each place its number of tokens + # A dictionary mapping Place objects to token counts is specified return Marking(place_or_dct_places) elif isinstance(dct_keys[0], str): - # we specified a dictionary associating to names of places a number of tokens + # A dictionary mapping place names to token counts is specified return Marking({dct_places[x]: y for x, y in place_or_dct_places.items()}) def reduce_petri_net_invisibles(net: PetriNet) -> PetriNet: """ - Reduce the number of invisibles transitions in the provided Petri net. + Reduces the number of invisible transitions in the provided Petri net. - :param net: petri net - :rtype: ``PetriNet`` + :param net: The Petri net to be reduced. + :return: The reduced Petri net with fewer invisible transitions. + :rtype: PetriNet .. 
code-block:: python3 @@ -451,40 +615,49 @@ def reduce_petri_net_invisibles(net: PetriNet) -> PetriNet: return reduction.apply_simple_reduction(net) -def reduce_petri_net_implicit_places(net: PetriNet, im: Marking, fm: Marking) -> Tuple[PetriNet, Marking, Marking]: +def reduce_petri_net_implicit_places( + net: PetriNet, + im: Marking, + fm: Marking +) -> Tuple[PetriNet, Marking, Marking]: """ - Reduce the number of invisibles transitions in the provided Petri net. + Reduces the number of implicit places in the provided Petri net. - :param net: petri net - :param im: initial marking - :param fm: final marking - :rtype: ``Tuple[PetriNet, Marking, Marking]`` + :param net: The Petri net to be reduced. + :param im: The initial marking of the Petri net. + :param fm: The final marking of the Petri net. + :return: A tuple containing the reduced Petri net, its initial marking, and its final marking. + :rtype: Tuple[PetriNet, Marking, Marking] .. code-block:: python3 import pm4py net, im, fm = pm4py.read_pnml('model.pnml') - net = pm4py.reduce_petri_net_implicit_places(net, im, fm) + net, im, fm = pm4py.reduce_petri_net_implicit_places(net, im, fm) """ from pm4py.objects.petri_net.utils import murata return murata.apply_reduction(net, im, fm) -def get_enabled_transitions(net: PetriNet, marking: Marking) -> Set[PetriNet.Transition]: +def get_enabled_transitions( + net: PetriNet, + marking: Marking +) -> Set[PetriNet.Transition]: """ - Gets the transitions enabled in a given marking + Retrieves the set of transitions that are enabled in a given marking of a Petri net. - :param net: Petri net - :param marking: marking - :rtype: ``Set[PetriNet.Transition]`` + :param net: The Petri net. + :param marking: The current marking of the Petri net. + :return: A set of transitions that are enabled in the provided marking. + :rtype: Set[PetriNet.Transition] .. code-block:: python3 import pm4py net, im, fm = pm4py.read_pnml('tests/input_data/running-example.pnml') - # gets the transitions enabled in the initial marking + # Gets the transitions enabled in the initial marking enabled_transitions = pm4py.get_enabled_transitions(net, im) """ from pm4py.objects.petri_net import semantics diff --git a/pm4py/conformance.py b/pm4py/conformance.py index 3a6d4c496..1d34b3abf 100644 --- a/pm4py/conformance.py +++ b/pm4py/conformance.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.conformance`` module contains the conformance checking algorithms implemented in ``pm4py`` +The ``pm4py.conformance`` module contains the conformance checking algorithms implemented in ``pm4py``. 
""" from typing import List, Dict, Any, Union, Optional, Tuple, Set @@ -35,66 +35,101 @@ import deprecation -def conformance_diagnostics_token_based_replay(log: Union[EventLog, pd.DataFrame], petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME, opt_parameters: Optional[Dict[Any, Any]] = None) -> List[Dict[str, Any]]: +def conformance_diagnostics_token_based_replay( + log: Union[EventLog, pd.DataFrame], + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME, + opt_parameters: Optional[Dict[Any, Any]] = None +) -> List[Dict[str, Any]]: """ Apply token-based replay for conformance checking analysis. - The methods return the full token-based-replay diagnostics. + This method returns the full token-based replay diagnostics. - Token-based replay matches a trace and a Petri net model, starting from the initial place, in order to discover which transitions are executed and in which places we have remaining or missing tokens for the given process instance. Token-based replay is useful for Conformance Checking: indeed, a trace is fitting according to the model if, during its execution, the transitions can be fired without the need to insert any missing token. If the reaching of the final marking is imposed, then a trace is fitting if it reaches the final marking without any missing or remaining tokens. + Token-based replay matches a trace against a Petri net model, starting from the initial marking, to discover which transitions are executed and in which places there are remaining or missing tokens for the given process instance. Token-based replay is useful for conformance checking: a trace fits the model if, during its execution, all transitions can be fired without the need to insert any missing tokens. If reaching the final marking is imposed, a trace fits if it reaches the final marking without any missing or remaining tokens. - In PM4Py there is an implementation of a token replayer that is able to go across hidden transitions (calculating shortest paths between places) and can be used with any Petri net model with unique visible transitions and hidden transitions. When a visible transition needs to be fired and not all places in the preset are provided with the correct number of tokens, starting from the current marking it is checked if for some place there is a sequence of hidden transitions that could be fired in order to enable the visible transition. The hidden transitions are then fired and a marking that permits to enable the visible transition is reached. + In PM4Py, the token replayer implementation can handle hidden transitions by calculating the shortest paths between places. It can be used with any Petri net model that has unique visible transitions and hidden transitions. When a visible transition needs to be fired and not all places in its preset have the correct number of tokens, the current marking is checked to see if any hidden transitions can be fired to enable the visible transition. The hidden transitions are then fired, reaching a marking that permits the firing of the visible transition. 
+ The approach is described in: Berti, Alessandro, and Wil MP van der Aalst. "Reviving Token-based Replay: Increasing Speed While Improving Diagnostics." ATAED@ Petri Nets/ACSD. 2019. - The output of the token-based replay, stored in the variable replayed_traces, contains for each trace of the log: - - - trace_is_fit: boolean value (True/False) that is true when the trace is according to the model. - - activated_transitions: list of transitions activated in the model by the token-based replay. - - reached_marking: marking reached at the end of the replay. - - missing_tokens: number of missing tokens. - - consumed_tokens: number of consumed tokens. - - remaining_tokens: number of remaining tokens. - - produced_tokens: number of produced tokens. - - :param log: event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param return_diagnostics_dataframe: if possible, returns a dataframe with the diagnostics (instead of the usual output) - :param opt_parameters: optional parameters of the token-based replay, including: - * reach_mark_through_hidden: boolean value that decides if we shall try to reach the final marking through hidden transitions - * stop_immediately_unfit: boolean value that decides if we shall stop immediately when a non-conformance is detected - * walk_through_hidden_trans: boolean value that decides if we shall walk through hidden transitions in order to enable visible transitions - * places_shortest_path_by_hidden: shortest paths between places by hidden transitions - * is_reduction: expresses if the token-based replay is called in a reduction attempt - * thread_maximum_ex_time: alignment threads maximum allowed execution time - * cleaning_token_flood: decides if a cleaning of the token flood shall be operated - * disable_variants: disable variants grouping - * return_object_names: decides whether names instead of object pointers shall be returned + The output of the token-based replay, stored in the variable `replayed_traces`, contains for each trace in the log: + + - **trace_is_fit**: Boolean value indicating whether the trace conforms to the model. + - **activated_transitions**: List of transitions activated in the model by the token-based replay. + - **reached_marking**: Marking reached at the end of the replay. + - **missing_tokens**: Number of missing tokens. + - **consumed_tokens**: Number of consumed tokens. + - **remaining_tokens**: Number of remaining tokens. + - **produced_tokens**: Number of produced tokens. + + :param log: Event log. + :param petri_net: Petri net. + :param initial_marking: Initial marking. + :param final_marking: Final marking. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :param return_diagnostics_dataframe: If possible, returns a dataframe with the diagnostics instead of the usual output (default is `constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME`). + :param opt_parameters: Optional parameters for the token-based replay, including: + * **reach_mark_through_hidden**: Boolean to decide if the final marking should be reached through hidden transitions. 
+ * **stop_immediately_unfit**: Boolean to decide if the replay should stop immediately when non-conformance is detected. + * **walk_through_hidden_trans**: Boolean to decide if the replay should walk through hidden transitions to enable visible transitions. + * **places_shortest_path_by_hidden**: Shortest paths between places using hidden transitions. + * **is_reduction**: Indicates if the token-based replay is called in a reduction attempt. + * **thread_maximum_ex_time**: Maximum allowed execution time for alignment threads. + * **cleaning_token_flood**: Decides if token flood cleaning should be performed. + * **disable_variants**: Disable variants grouping. + * **return_object_names**: Decide whether to return names instead of object pointers. + :return: A list of dictionaries containing diagnostics for each trace. :rtype: ``List[Dict[str, Any]]`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - tbr_diagnostics = pm4py.conformance_diagnostics_token_based_replay(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + tbr_diagnostics = pm4py.conformance_diagnostics_token_based_replay( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) if return_diagnostics_dataframe: from pm4py.convert import convert_to_event_log log = convert_to_event_log(log, case_id_key=case_id_key) case_id_key = None - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) if opt_parameters is None: opt_parameters = {} @@ -103,373 +138,654 @@ def conformance_diagnostics_token_based_replay(log: Union[EventLog, pd.DataFrame properties[k] = v from pm4py.algo.conformance.tokenreplay import algorithm as token_replay - result = token_replay.apply(log, petri_net, initial_marking, final_marking, parameters=properties) + result = token_replay.apply( + log, + petri_net, + initial_marking, + final_marking, + parameters=properties + ) if return_diagnostics_dataframe: - return token_replay.get_diagnostics_dataframe(log, result, parameters=properties) + return token_replay.get_diagnostics_dataframe( + log, + result, + parameters=properties + ) return result -def conformance_diagnostics_alignments(log: Union[EventLog, pd.DataFrame], *args, multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", variant_str : Optional[str] = None, return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME, **kwargs) -> List[Dict[str, Any]]: +def conformance_diagnostics_alignments( + log: Union[EventLog, 
pd.DataFrame], + *args, + multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + variant_str: Optional[str] = None, + return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME, + **kwargs +) -> List[Dict[str, Any]]: """ Apply the alignments algorithm between a log and a process model. - The methods return the full alignment diagnostics. - - Alignment-based replay aims to find one of the best alignment between the trace and the model. For each trace, the output of an alignment is a list of couples where the first element is an event (of the trace) or » and the second element is a transition (of the model) or ». For each couple, the following classification could be provided: - - - Sync move: the classification of the event corresponds to the transition label; in this case, both the trace and the model advance in the same way during the replay. - - Move on log: for couples where the second element is », it corresponds to a replay move in the trace that is not mimicked in the model. This kind of move is unfit and signal a deviation between the trace and the model. - - Move on model: for couples where the first element is », it corresponds to a replay move in the model that is not mimicked in the trace. For moves on model, we can have the following distinction: - * Moves on model involving hidden transitions: in this case, even if it is not a sync move, the move is fit. - * Moves on model not involving hidden transitions: in this case, the move is unfit and signals a deviation between the trace and the model. - - With each trace, a dictionary containing among the others the following information is associated: - - alignment: contains the alignment (sync moves, moves on log, moves on model) - cost: contains the cost of the alignment according to the provided cost function - fitness: is equal to 1 if the trace is perfectly fitting. - - :param log: event log - :param args: specification of the process model - :param multi_processing: boolean value that enables the multiprocessing - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param variant_str: variant specification (for Petri net alignments) - :param return_diagnostics_dataframe: if possible, returns a dataframe with the diagnostics (instead of the usual output) + This method returns the full alignment diagnostics. + + Alignment-based replay aims to find one of the best alignments between the trace and the model. For each trace, the output of an alignment is a list of pairs where the first element is an event (from the trace) or ``»`` and the second element is a transition (from the model) or ``»``. Each pair can be classified as follows: + + - **Sync move**: The event and transition labels correspond, advancing both the trace and the model simultaneously. + - **Move on log**: The transition is ``»``, indicating a replay move in the trace that is not mirrored in the model. This move is unfit and signals a deviation. + - **Move on model**: The event is ``»``, indicating a replay move in the model not mirrored in the trace. These can be further classified as: + * **Moves on model involving hidden transitions**: Even if it's not a sync move, the move is fit. 
+ * **Moves on model not involving hidden transitions**: The move is unfit and signals a deviation. + + For each trace, a dictionary is associated containing, among other details: + + - **alignment**: The alignment pairs (sync moves, moves on log, moves on model). + - **cost**: The cost of the alignment based on the provided cost function. + - **fitness**: Equals 1 if the trace fits perfectly. + + :param log: Event log. + :param args: Specifications of the process model. + :param multi_processing: Boolean to enable multiprocessing (default is `constants.ENABLE_MULTIPROCESSING_DEFAULT`). + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :param variant_str: Variant specification (for Petri net alignments). + :param return_diagnostics_dataframe: If possible, returns a dataframe with the diagnostics instead of the usual output (default is `constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME`). + :return: A list of dictionaries containing diagnostics for each trace. :rtype: ``List[Dict[str, Any]]`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - alignments_diagnostics = pm4py.conformance_diagnostics_alignments(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + alignments_diagnostics = pm4py.conformance_diagnostics_alignments( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) if return_diagnostics_dataframe: from pm4py.convert import convert_to_event_log log = convert_to_event_log(log, case_id_key=case_id_key) case_id_key = None - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - if kwargs is not None: + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) + if kwargs: for k, v in kwargs.items(): properties[k] = v if len(args) == 3: - if type(args[0]) is PetriNet: + if isinstance(args[0], PetriNet): # Petri net alignments from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments variant = alignments.DEFAULT_VARIANT if variant_str is not None: variant = variant_str if multi_processing: - result = alignments.apply_multiprocessing(log, args[0], args[1], args[2], parameters=properties, variant=variant) + result = alignments.apply_multiprocessing( + log, args[0], args[1], args[2], + parameters=properties, variant=variant + ) else: - result = alignments.apply(log, args[0], args[1], args[2], parameters=properties, variant=variant) + result = alignments.apply( + log, args[0], args[1], args[2], + 
parameters=properties, variant=variant + ) if return_diagnostics_dataframe: - return alignments.get_diagnostics_dataframe(log, result, parameters=properties) + return alignments.get_diagnostics_dataframe( + log, result, parameters=properties + ) return result elif isinstance(args[0], dict): # DFG alignments from pm4py.algo.conformance.alignments.dfg import algorithm as dfg_alignment - result = dfg_alignment.apply(log, args[0], args[1], args[2], parameters=properties) + result = dfg_alignment.apply( + log, args[0], args[1], args[2], + parameters=properties + ) return result elif len(args) == 1: - if type(args[0]) is ProcessTree: - # process tree alignments + if isinstance(args[0], ProcessTree): + # Process tree alignments from pm4py.algo.conformance.alignments.process_tree.variants import search_graph_pt if multi_processing: - result = search_graph_pt.apply_multiprocessing(log, args[0], parameters=properties) + result = search_graph_pt.apply_multiprocessing( + log, args[0], parameters=properties + ) else: - result = search_graph_pt.apply(log, args[0], parameters=properties) + result = search_graph_pt.apply( + log, args[0], parameters=properties + ) return result - elif type(args[0]) in [EventLog, pd.DataFrame]: - # edit distance alignments (log2log) + elif isinstance(args[0], (EventLog, pd.DataFrame)): + # Edit distance alignments (log to log) from pm4py.algo.conformance.alignments.edit_distance import algorithm as edit_distance_alignments - result = edit_distance_alignments.apply(log, args[0], parameters=properties) + result = edit_distance_alignments.apply( + log, args[0], + parameters=properties + ) return result - # try to convert to Petri net + # Try to convert to Petri net import pm4py from pm4py.algo.conformance.alignments.petri_net import algorithm as alignments net, im, fm = pm4py.convert_to_petri_net(*args) if multi_processing: - result = alignments.apply_multiprocessing(log, net, im, fm, parameters=properties) + result = alignments.apply_multiprocessing( + log, net, im, fm, + parameters=properties + ) else: - result = alignments.apply(log, net, im, fm, parameters=properties) + result = alignments.apply( + log, net, im, fm, + parameters=properties + ) if return_diagnostics_dataframe: - return alignments.get_diagnostics_dataframe(log, result, parameters=properties) + return alignments.get_diagnostics_dataframe( + log, + result, + parameters=properties + ) return result -def fitness_token_based_replay(log: Union[EventLog, pd.DataFrame], petri_net: PetriNet, initial_marking: Marking, final_marking: Marking, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> \ - Dict[ - str, float]: +def fitness_token_based_replay( + log: Union[EventLog, pd.DataFrame], + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Dict[str, float]: """ - Calculates the fitness using token-based replay. + Calculate the fitness using token-based replay. The fitness is calculated on a log-based level. The output dictionary contains the following keys: - - perc_fit_traces (the percentage of fit traces (from 0.0 to 100.0)) - - average_trace_fitness (between 0.0 and 1.0; computed as average of the trace fitnesses) - - log_fitness (between 0.0 and 1.0) - - percentage_of_fitting_traces (the percentage of fit traces (from 0.0 to 100.0) + - **perc_fit_traces**: Percentage of fit traces (from 0.0 to 100.0). 
+ - **average_trace_fitness**: Average of the trace fitnesses (between 0.0 and 1.0). + - **log_fitness**: Overall fitness of the log (between 0.0 and 1.0). + - **percentage_of_fitting_traces**: Percentage of fit traces (from 0.0 to 100.0). - Token-based replay matches a trace and a Petri net model, starting from the initial place, in order to discover which transitions are executed and in which places we have remaining or missing tokens for the given process instance. Token-based replay is useful for Conformance Checking: indeed, a trace is fitting according to the model if, during its execution, the transitions can be fired without the need to insert any missing token. If the reaching of the final marking is imposed, then a trace is fitting if it reaches the final marking without any missing or remaining tokens. + Token-based replay matches a trace against a Petri net model, starting from the initial marking, to discover which transitions are executed and in which places there are remaining or missing tokens for the given process instance. Token-based replay is useful for conformance checking: a trace fits the model if, during its execution, all transitions can be fired without the need to insert any missing tokens. If reaching the final marking is imposed, a trace fits if it reaches the final marking without any missing or remaining tokens. - In PM4Py there is an implementation of a token replayer that is able to go across hidden transitions (calculating shortest paths between places) and can be used with any Petri net model with unique visible transitions and hidden transitions. When a visible transition needs to be fired and not all places in the preset are provided with the correct number of tokens, starting from the current marking it is checked if for some place there is a sequence of hidden transitions that could be fired in order to enable the visible transition. The hidden transitions are then fired and a marking that permits to enable the visible transition is reached. + In PM4Py, the token replayer implementation can handle hidden transitions by calculating the shortest paths between places. It can be used with any Petri net model that has unique visible transitions and hidden transitions. When a visible transition needs to be fired and not all places in its preset have the correct number of tokens, the current marking is checked to see if any hidden transitions can be fired to enable the visible transition. The hidden transitions are then fired, reaching a marking that permits the firing of the visible transition. + The approach is described in: Berti, Alessandro, and Wil MP van der Aalst. "Reviving Token-based Replay: Increasing Speed While Improving Diagnostics." ATAED@ Petri Nets/ACSD. 2019. - The calculation of the replay fitness aim to calculate how much of the behavior in the log is admitted by the process model. We propose two methods to calculate replay fitness, based on token-based replay and alignments respectively. + The calculation of replay fitness aims to assess how much of the behavior in the log is admitted by the process model. Two methods are proposed to calculate replay fitness, based on token-based replay and alignments respectively. 
- For token-based replay, the percentage of traces that are completely fit is returned, along with a fitness value that is calculated as indicated in the scientific contribution + For token-based replay, the percentage of traces that are completely fit is returned, along with a fitness value calculated as indicated in the referenced contribution. - :param log: event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log. + :param petri_net: Petri net. + :param initial_marking: Initial marking. + :param final_marking: Final marking. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :return: A dictionary containing fitness metrics. :rtype: ``Dict[str, float]`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - fitness_tbr = pm4py.fitness_token_based_replay(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + fitness_tbr = pm4py.fitness_token_based_replay( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) + + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness - result = replay_fitness.apply(log, petri_net, initial_marking, final_marking, - variant=replay_fitness.Variants.TOKEN_BASED, parameters=properties) + result = replay_fitness.apply( + log, + petri_net, + initial_marking, + final_marking, + variant=replay_fitness.Variants.TOKEN_BASED, + parameters=properties + ) return result -def fitness_alignments(log: Union[EventLog, pd.DataFrame], petri_net: PetriNet, initial_marking: Marking, final_marking: Marking, multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", variant_str : Optional[str] = None) -> \ - Dict[str, float]: +def fitness_alignments( + log: Union[EventLog, pd.DataFrame], + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, + activity_key: str = "concept:name", + timestamp_key: str = 
"time:timestamp", + case_id_key: str = "case:concept:name", + variant_str: Optional[str] = None +) -> Dict[str, float]: """ - Calculates the fitness using alignments + Calculate the fitness using alignments. The output dictionary contains the following keys: - - average_trace_fitness (between 0.0 and 1.0; computed as average of the trace fitnesses) - - log_fitness (between 0.0 and 1.0) - - percentage_of_fitting_traces (the percentage of fit traces (from 0.0 to 100.0) - - Alignment-based replay aims to find one of the best alignment between the trace and the model. For each trace, the output of an alignment is a list of couples where the first element is an event (of the trace) or » and the second element is a transition (of the model) or ». For each couple, the following classification could be provided: - - - Sync move: the classification of the event corresponds to the transition label; in this case, both the trace and the model advance in the same way during the replay. - - Move on log: for couples where the second element is », it corresponds to a replay move in the trace that is not mimicked in the model. This kind of move is unfit and signal a deviation between the trace and the model. - - Move on model: for couples where the first element is », it corresponds to a replay move in the model that is not mimicked in the trace. For moves on model, we can have the following distinction: - * Moves on model involving hidden transitions: in this case, even if it is not a sync move, the move is fit. - * Moves on model not involving hidden transitions: in this case, the move is unfit and signals a deviation between the trace and the model. - - The calculation of the replay fitness aim to calculate how much of the behavior in the log is admitted by the process model. We propose two methods to calculate replay fitness, based on token-based replay and alignments respectively. - - For alignments, the percentage of traces that are completely fit is returned, along with a fitness value that is calculated as the average of the fitness values of the single traces. - - :param log: event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param multi_processing: boolean value that enables the multiprocessing - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param variant_str: variant specification + - **average_trace_fitness**: Average of the trace fitnesses (between 0.0 and 1.0). + - **log_fitness**: Overall fitness of the log (between 0.0 and 1.0). + - **percentage_of_fitting_traces**: Percentage of fit traces (from 0.0 to 100.0). + + Alignment-based replay aims to find one of the best alignments between the trace and the model. For each trace, the output of an alignment is a list of pairs where the first element is an event (from the trace) or ``»`` and the second element is a transition (from the model) or ``»``. Each pair can be classified as follows: + + - **Sync move**: The event and transition labels correspond, advancing both the trace and the model simultaneously. + - **Move on log**: The transition is ``»``, indicating a replay move in the trace that is not mirrored in the model. This move is unfit and signals a deviation. + - **Move on model**: The event is ``»``, indicating a replay move in the model not mirrored in the trace. 
These can be further classified as: + * **Moves on model involving hidden transitions**: Even if it's not a sync move, the move is fit. + * **Moves on model not involving hidden transitions**: The move is unfit and signals a deviation. + + The calculation of replay fitness aims to assess how much of the behavior in the log is admitted by the process model. Two methods are proposed to calculate replay fitness, based on token-based replay and alignments respectively. + + For alignments, the percentage of traces that are completely fit is returned, along with a fitness value calculated as the average of the fitness values of the individual traces. + + :param log: Event log. + :param petri_net: Petri net. + :param initial_marking: Initial marking. + :param final_marking: Final marking. + :param multi_processing: Boolean to enable multiprocessing (default is `constants.ENABLE_MULTIPROCESSING_DEFAULT`). + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :param variant_str: Variant specification. + :return: A dictionary containing fitness metrics. :rtype: ``Dict[str, float]`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - fitness_alignments = pm4py.fitness_alignments(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + fitness_alignments = pm4py.fitness_alignments( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness - parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + parameters = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) parameters["multiprocessing"] = multi_processing - result = replay_fitness.apply(log, petri_net, initial_marking, final_marking, - variant=replay_fitness.Variants.ALIGNMENT_BASED, align_variant=variant_str, parameters=parameters) + result = replay_fitness.apply( + log, + petri_net, + initial_marking, + final_marking, + variant=replay_fitness.Variants.ALIGNMENT_BASED, + align_variant=variant_str, + parameters=parameters + ) return result -def precision_token_based_replay(log: Union[EventLog, pd.DataFrame], petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> float: +def precision_token_based_replay( + log: Union[EventLog, pd.DataFrame], + petri_net: PetriNet, + initial_marking: Marking, + 
final_marking: Marking, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> float: """ - Calculates the precision precision using token-based replay + Calculate precision using token-based replay. - Token-based replay matches a trace and a Petri net model, starting from the initial place, in order to discover which transitions are executed and in which places we have remaining or missing tokens for the given process instance. Token-based replay is useful for Conformance Checking: indeed, a trace is fitting according to the model if, during its execution, the transitions can be fired without the need to insert any missing token. If the reaching of the final marking is imposed, then a trace is fitting if it reaches the final marking without any missing or remaining tokens. + Token-based replay matches a trace against a Petri net model, starting from the initial marking, to discover which transitions are executed and in which places there are remaining or missing tokens for the given process instance. Token-based replay is useful for conformance checking: a trace fits the model if, during its execution, all transitions can be fired without the need to insert any missing tokens. If reaching the final marking is imposed, a trace fits if it reaches the final marking without any missing or remaining tokens. - In PM4Py there is an implementation of a token replayer that is able to go across hidden transitions (calculating shortest paths between places) and can be used with any Petri net model with unique visible transitions and hidden transitions. When a visible transition needs to be fired and not all places in the preset are provided with the correct number of tokens, starting from the current marking it is checked if for some place there is a sequence of hidden transitions that could be fired in order to enable the visible transition. The hidden transitions are then fired and a marking that permits to enable the visible transition is reached. + In PM4Py, the token replayer implementation can handle hidden transitions by calculating the shortest paths between places. It can be used with any Petri net model that has unique visible transitions and hidden transitions. When a visible transition needs to be fired and not all places in its preset have the correct number of tokens, the current marking is checked to see if any hidden transitions can be fired to enable the visible transition. The hidden transitions are then fired, reaching a marking that permits the firing of the visible transition. + The approach is described in: Berti, Alessandro, and Wil MP van der Aalst. "Reviving Token-based Replay: Increasing Speed While Improving Diagnostics." ATAED@ Petri Nets/ACSD. 2019. The reference paper for the TBR-based precision (ETConformance) is: Muñoz-Gama, Jorge, and Josep Carmona. "A fresh look at precision in process conformance." International Conference on Business Process Management. Springer, Berlin, Heidelberg, 2010. - In this approach, the different prefixes of the log are replayed (whether possible) on the model. At the reached marking, the set of transitions that are enabled in the process model is compared with the set of activities that follow the prefix. The more the sets are different, the more the precision value is low. The more the sets are similar, the more the precision value is high. + In this approach, the different prefixes of the log are replayed (if possible) on the model. 
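As a rough, non-authoritative sketch of how this precision value is typically used, the snippet below computes token-based precision together with token-based fitness and combines them into an F-measure-style score; the log path and the combination formula are illustrative assumptions, not part of this API.

```python
import pm4py

# Illustrative sketch: combine token-based replay fitness and precision into a
# single F-measure-style score. The log path is an example only.
log = pm4py.read_xes("tests/input_data/running-example.xes")
net, im, fm = pm4py.discover_petri_net_inductive(log)

fitness = pm4py.fitness_token_based_replay(log, net, im, fm)["log_fitness"]
precision = pm4py.precision_token_based_replay(log, net, im, fm)
f_measure = 2 * fitness * precision / (fitness + precision) if (fitness + precision) > 0 else 0.0
print(round(f_measure, 3))
```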
At the reached marking, the set of transitions that are enabled in the process model is compared with the set of activities that follow the prefix. The more the sets differ, the lower the precision value. The more the sets are similar, the higher the precision value. - :param log: event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log. + :param petri_net: Petri net. + :param initial_marking: Initial marking. + :param final_marking: Final marking. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :return: The precision value. :rtype: ``float`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - precision_tbr = pm4py.precision_token_based_replay(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + precision_tbr = pm4py.precision_token_based_replay( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) + + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.evaluation.precision import algorithm as precision_evaluator - result = precision_evaluator.apply(log, petri_net, initial_marking, final_marking, - variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN, parameters=properties) + result = precision_evaluator.apply( + log, + petri_net, + initial_marking, + final_marking, + variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN, + parameters=properties + ) return result -def precision_alignments(log: Union[EventLog, pd.DataFrame], petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking, multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> float: +def precision_alignments( + log: Union[EventLog, pd.DataFrame], + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> float: """ - Calculates the precision of the model w.r.t. 
the event log using alignments + Calculate the precision of the model with respect to the event log using alignments. - Alignment-based replay aims to find one of the best alignment between the trace and the model. For each trace, the output of an alignment is a list of couples where the first element is an event (of the trace) or » and the second element is a transition (of the model) or ». For each couple, the following classification could be provided: + Alignment-based replay aims to find one of the best alignments between the trace and the model. For each trace, the output of an alignment is a list of pairs where the first element is an event (from the trace) or ``»`` and the second element is a transition (from the model) or ``»``. Each pair can be classified as follows: - - Sync move: the classification of the event corresponds to the transition label; in this case, both the trace and the model advance in the same way during the replay. - - Move on log: for couples where the second element is », it corresponds to a replay move in the trace that is not mimicked in the model. This kind of move is unfit and signal a deviation between the trace and the model. - - Move on model: for couples where the first element is », it corresponds to a replay move in the model that is not mimicked in the trace. For moves on model, we can have the following distinction: - * Moves on model involving hidden transitions: in this case, even if it is not a sync move, the move is fit. - * Moves on model not involving hidden transitions: in this case, the move is unfit and signals a deviation between the trace and the model. + - **Sync move**: The event and transition labels correspond, advancing both the trace and the model simultaneously. + - **Move on log**: The transition is ``»``, indicating a replay move in the trace that is not mirrored in the model. This move is unfit and signals a deviation. + - **Move on model**: The event is ``»``, indicating a replay move in the model not mirrored in the trace. These can be further classified as: + * **Moves on model involving hidden transitions**: Even if it's not a sync move, the move is fit. + * **Moves on model not involving hidden transitions**: The move is unfit and signals a deviation. The reference paper for the alignments-based precision (Align-ETConformance) is: - Adriansyah, Arya, et al. "Measuring precision of modeled behavior." Information systems and e-Business Management 13.1 (2015): 37-67 - - In this approach, the different prefixes of the log are replayed (whether possible) on the model. At the reached marking, the set of transitions that are enabled in the process model is compared with the set of activities that follow the prefix. The more the sets are different, the more the precision value is low. The more the sets are similar, the more the precision value is high. - - :param log: event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param multi_processing: boolean value that enables the multiprocessing - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + Adriansyah, Arya, et al. "Measuring precision of modeled behavior." Information systems and e-Business Management 13.1 (2015): 37-67. + + In this approach, the different prefixes of the log are replayed (if possible) on the model. 
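As a hedged usage sketch, alignment-based precision is generally more expensive to compute than the token-based variant, so the `multi_processing` flag documented below can help on larger logs; the comparison and the log path below are illustrative only.

```python
import pm4py

# Illustrative sketch: compare alignment-based and token-based precision on the
# same model. multi_processing can be enabled for larger logs; here it is left off.
log = pm4py.read_xes("tests/input_data/running-example.xes")
net, im, fm = pm4py.discover_petri_net_inductive(log)

prec_align = pm4py.precision_alignments(log, net, im, fm, multi_processing=False)
prec_tbr = pm4py.precision_token_based_replay(log, net, im, fm)
print(prec_align, prec_tbr)
```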
At the reached marking, the set of transitions that are enabled in the process model is compared with the set of activities that follow the prefix. The more the sets differ, the lower the precision value. The more the sets are similar, the higher the precision value. + + :param log: Event log. + :param petri_net: Petri net. + :param initial_marking: Initial marking. + :param final_marking: Final marking. + :param multi_processing: Boolean to enable multiprocessing (default is `constants.ENABLE_MULTIPROCESSING_DEFAULT`). + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :return: The precision value. :rtype: ``float`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - precision_alignments = pm4py.precision_alignments(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + precision_alignments = pm4py.precision_alignments( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.evaluation.precision import algorithm as precision_evaluator - parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + parameters = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) parameters["multiprocessing"] = multi_processing - result = precision_evaluator.apply(log, petri_net, initial_marking, final_marking, - variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE, - parameters=parameters) + result = precision_evaluator.apply( + log, + petri_net, + initial_marking, + final_marking, + variant=precision_evaluator.Variants.ALIGN_ETCONFORMANCE, + parameters=parameters + ) return result -def generalization_tbr(log: Union[EventLog, pd.DataFrame], petri_net: PetriNet, initial_marking: Marking, - final_marking: Marking, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> float: +def generalization_tbr( + log: Union[EventLog, pd.DataFrame], + petri_net: PetriNet, + initial_marking: Marking, + final_marking: Marking, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> float: """ - Computes the generalization of the model (against the event log). The approach is described in the paper: - - Buijs, Joos CAM, Boudewijn F. van Dongen, and Wil MP van der Aalst. "Quality dimensions in process discovery: The importance of fitness, precision, generalization and simplicity." 
International Journal of Cooperative Information Systems 23.01 (2014): 1440001. - - - :param log: event log - :param petri_net: petri net - :param initial_marking: initial marking - :param final_marking: final marking - :param multi_processing: boolean value that enables the multiprocessing - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + Compute the generalization of the model against the event log. + The approach is described in the paper: + + Buijs, Joos CAM, Boudewijn F. van Dongen, and Wil MP van der Aalst. "Quality dimensions in process discovery: The importance of fitness, precision, generalization, and simplicity." International Journal of Cooperative Information Systems 23.01 (2014): 1440001. + + :param log: Event log. + :param petri_net: Petri net. + :param initial_marking: Initial marking. + :param final_marking: Final marking. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :return: The generalization value. :rtype: ``float`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - generalization_tbr = pm4py.generalization_tbr(dataframe, net, im, fm) + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + generalization_tbr = pm4py.generalization_tbr( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator - parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - result = generalization_evaluator.apply(log, petri_net, initial_marking, final_marking, variant=generalization_evaluator.Variants.GENERALIZATION_TOKEN, parameters=parameters) + parameters = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) + result = generalization_evaluator.apply( + log, + petri_net, + initial_marking, + final_marking, + variant=generalization_evaluator.Variants.GENERALIZATION_TOKEN, + parameters=parameters + ) return result -def replay_prefix_tbr(prefix: List[str], net: PetriNet, im: Marking, fm: Marking, activity_key: str = "concept:name") -> Marking: +def replay_prefix_tbr( + prefix: List[str], + net: PetriNet, + im: Marking, + fm: Marking, + activity_key: str = "concept:name" +) -> Marking: """ - Replays a prefix (list of activities) on a given accepting Petri net, using Token-Based Replay. 
- - :param prefix: list of activities - :param net: Petri net - :param im: initial marking - :param fm: final marking - :param activity_key: attribute to be used as activity - :rtype: ``Marking`` - - .. code-block:: python3 - + Replay a prefix (list of activities) on a given accepting Petri net using Token-Based Replay. + + :param prefix: List of activities representing the prefix. + :param net: Petri net. + :param im: Initial marking. + :param fm: Final marking. + :param activity_key: Attribute to be used as the activity key (default is "concept:name"). + :return: The marking reached after replaying the prefix. + :rtype: ``Marking`` + + Example: + ```python import pm4py net, im, fm = pm4py.read_pnml('tests/input_data/running-example.pnml') - marking = pm4py.replay_prefix_tbr(['register request', 'check ticket'], net, im, fm) + marking = pm4py.replay_prefix_tbr( + ['register request', 'check ticket'], + net, + im, + fm, + activity_key='concept:name' + ) + ``` """ purpose_log = EventLog() trace = Trace() @@ -489,17 +805,25 @@ def replay_prefix_tbr(prefix: List[str], net: PetriNet, im: Marking, fm: Marking return res["reached_marking"] -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="conformance checking using footprints will not be exposed in a future release") +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="Conformance checking using footprints will not be exposed in a future release." +) def __convert_to_fp(*args) -> Union[List[Dict[str, Any]], Dict[str, Any]]: """ - Internal method to convert the provided event log / process model argument - to footprints (using footprints discovery) + Internal method to convert the provided event log or process model arguments + to footprints using footprints discovery. - :param args: event log / process model + :param args: Event log or process model. + :return: Footprints representation. :rtype: ``Union[List[Dict[str, Any]], Dict[str, Any]]`` + + Note: + This is an internal method and is deprecated. """ import pm4py - while type(args) is tuple: + while isinstance(args, tuple): if len(args) == 1: args = args[0] else: @@ -511,48 +835,95 @@ def __convert_to_fp(*args) -> Union[List[Dict[str, Any]], Dict[str, Any]]: return fp -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="conformance checking using footprints will not be exposed in a future release") +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="Conformance checking using footprints will not be exposed in a future release." +) def conformance_diagnostics_footprints(*args) -> Union[List[Dict[str, Any]], Dict[str, Any]]: """ - Provide conformance checking diagnostics using footprints + Provide conformance checking diagnostics using footprints. - :param args: provided arguments (the first argument is supposed to be an event log (or the footprints discovered from the event log); the other arguments are supposed to be the process model (or the footprints discovered from the process model). + :param args: Arguments where the first is an event log (or its footprints) and the others represent the process model (or its footprints). + :return: Conformance diagnostics based on footprints. :rtype: ``Union[List[Dict[str, Any]], Dict[str, Any]]`` - .. 
code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - footprints_diagnostics = pm4py.conformance_diagnostics_footprints(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + footprints_diagnostics = pm4py.conformance_diagnostics_footprints( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ fp1 = __convert_to_fp(args[0]) fp2 = __convert_to_fp(args[1:]) from pm4py.algo.conformance.footprints import algorithm as footprints_conformance if isinstance(fp1, list): - result = footprints_conformance.apply(fp1, fp2, variant=footprints_conformance.Variants.TRACE_EXTENSIVE) + result = footprints_conformance.apply( + fp1, + fp2, + variant=footprints_conformance.Variants.TRACE_EXTENSIVE + ) else: - result = footprints_conformance.apply(fp1, fp2, variant=footprints_conformance.Variants.LOG_EXTENSIVE) + result = footprints_conformance.apply( + fp1, + fp2, + variant=footprints_conformance.Variants.LOG_EXTENSIVE + ) return result -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="conformance checking using footprints will not be exposed in a future release") +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="Conformance checking using footprints will not be exposed in a future release." +) def fitness_footprints(*args) -> Dict[str, float]: """ - Calculates fitness using footprints. The output is a dictionary containing two keys: - - perc_fit_traces => percentage of fit traces (over the log) - - log_fitness => the fitness value over the log + Calculate fitness using footprints. + The output is a dictionary containing two keys: + - **perc_fit_traces**: Percentage of fit traces (over the log). + - **log_fitness**: The fitness value over the log. - :param args: provided arguments (the first argument is supposed to be an event log (or the footprints discovered from the event log); the other arguments are supposed to be the process model (or the footprints discovered from the process model). + :param args: Arguments where the first is an event log (or its footprints) and the others represent the process model (or its footprints). + :return: A dictionary containing fitness metrics based on footprints. :rtype: ``Dict[str, float]`` - .. 
code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - fitness_fp = pm4py.fitness_footprints(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + fitness_fp = pm4py.fitness_footprints( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ fp_conf = conformance_diagnostics_footprints(*args) fp1 = __convert_to_fp(args[0]) @@ -563,20 +934,39 @@ def fitness_footprints(*args) -> Dict[str, float]: return result -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="conformance checking using footprints will not be exposed in a future release") +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="Conformance checking using footprints will not be exposed in a future release." +) def precision_footprints(*args) -> float: """ - Calculates precision using footprints + Calculate precision using footprints. - :param args: provided arguments (the first argument is supposed to be an event log (or the footprints discovered from the event log); the other arguments are supposed to be the process model (or the footprints discovered from the process model). + :param args: Arguments where the first is an event log (or its footprints) and the others represent the process model (or its footprints). + :return: The precision value based on footprints. :rtype: ``float`` - .. code-block:: python3 - + Example: + ```python import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - precision_fp = pm4py.precision_footprints(dataframe, net, im, fm, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + precision_fp = pm4py.precision_footprints( + dataframe, + net, + im, + fm, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ fp1 = __convert_to_fp(args[0]) fp2 = __convert_to_fp(args[1:]) @@ -586,14 +976,22 @@ def precision_footprints(*args) -> float: return result -@deprecation.deprecated(removed_in="2.3.0", deprecated_in="3.0.0", details="this method will be removed in a future release.") +@deprecation.deprecated( + removed_in="2.3.0", + deprecated_in="3.0.0", + details="This method will be removed in a future release." +) def __check_is_fit_process_tree(trace, tree) -> bool: """ - Check if a trace object is fit against a process tree model + Check if a trace object fits a process tree model. - :param trace: trace - :param tree: process tree + :param trace: Trace. + :param tree: Process tree. + :return: True if the trace fits the process tree, False otherwise. :rtype: ``bool`` + + Note: + This is an internal method and is deprecated. 
""" __event_log_deprecation_warning(trace) @@ -603,66 +1001,110 @@ def __check_is_fit_process_tree(trace, tree) -> bool: fp_tree = discover_footprints(tree) fp_log = discover_footprints(log) fp_conf_res = conformance_diagnostics_footprints(fp_log, fp_tree)[0] - # CHECK 1) if footprints already say is not fit, then return False - # (if they say True, it might be a false positive) + # CHECK 1) If footprints indicate non-conformance, return False if not fp_conf_res["is_footprints_fit"]: return False else: from pm4py.convert import convert_to_petri_net net, im, fm = convert_to_petri_net(tree) - tbr_conf_res = conformance_diagnostics_token_based_replay(log, net, im, fm, return_diagnostics_dataframe=False)[0] - # CHECK 2) if TBR says that is fit, then return True - # (if they say False, it might be a false negative) + tbr_conf_res = conformance_diagnostics_token_based_replay( + log, + net, + im, + fm, + return_diagnostics_dataframe=False + )[0] + # CHECK 2) If TBR indicates fit, return True if tbr_conf_res["trace_is_fit"]: return True else: - # CHECK 3) alignments definitely say if the trace is fit or not if the previous methods fail - align_conf_res = conformance_diagnostics_alignments(log, tree, return_diagnostics_dataframe=False)[0] + # CHECK 3) Use alignments for definitive fit assessment + align_conf_res = conformance_diagnostics_alignments( + log, + tree, + return_diagnostics_dataframe=False + )[0] return align_conf_res["fitness"] == 1.0 -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="this method will be removed in a future release.") -def __check_is_fit_petri_net(trace, net, im, fm, activity_key=xes_constants.DEFAULT_NAME_KEY) -> bool: +@deprecation.deprecated( + removed_in="2.3.0", + deprecated_in="3.0.0", + details="This method will be removed in a future release." +) +def __check_is_fit_petri_net( + trace, + net: PetriNet, + im: Marking, + fm: Marking, + activity_key=xes_constants.DEFAULT_NAME_KEY +) -> bool: """ - Checks if a trace object is fit against Petri net object - - :param trace: trace - :param net: petri net - :param im: initial marking - :param fm: final marking - :param activity_key: attribute to be used as activity + Check if a trace object fits a Petri net model. + + :param trace: Trace. + :param net: Petri net. + :param im: Initial marking. + :param fm: Final marking. + :param activity_key: Attribute to be used as the activity key (default is defined in `xes_constants.DEFAULT_NAME_KEY`). + :return: True if the trace fits the Petri net, False otherwise. :rtype: ``bool`` + + Note: + This is an internal method and is deprecated. 
""" __event_log_deprecation_warning(trace) - # avoid checking footprints on Petri net (they are too slow) - activities_model = set(trans.label for trans in net.transitions if trans.label is not None) + # Avoid checking footprints on Petri net (they are too slow) + activities_model = set( + trans.label for trans in net.transitions if trans.label is not None + ) activities_trace = set([x[activity_key] for x in trace]) diff = activities_trace.difference(activities_model) if diff: - # CHECK 1) there are activities in the trace that are not in the model + # CHECK 1) If there are activities in the trace not present in the model, return False return False else: log = EventLog() log.append(trace) - tbr_conf_res = conformance_diagnostics_token_based_replay(log, net, im, fm, return_diagnostics_dataframe=False)[0] - # CHECK 2) if TBR says that is fit, then return True - # (if they say False, it might be a false negative) + tbr_conf_res = conformance_diagnostics_token_based_replay( + log, net, im, fm, + return_diagnostics_dataframe=False + )[0] + # CHECK 2) If TBR indicates fit, return True if tbr_conf_res["trace_is_fit"]: return True else: - # CHECK 3) alignments definitely say if the trace is fit or not if the previous methods fail - align_conf_res = conformance_diagnostics_alignments(log, net, im, fm, return_diagnostics_dataframe=False)[0] + # CHECK 3) Use alignments for definitive fit assessment + align_conf_res = conformance_diagnostics_alignments( + log, + net, + im, + fm, + return_diagnostics_dataframe=False + )[0] return align_conf_res["fitness"] == 1.0 -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="this method will be removed in a future release.") -def check_is_fitting(*args, activity_key=xes_constants.DEFAULT_NAME_KEY) -> bool: +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="This method will be removed in a future release." +) +def check_is_fitting( + *args, + activity_key=xes_constants.DEFAULT_NAME_KEY +) -> bool: """ - Checks if a trace object is fit against a process model + Check if a trace object fits a process model. - :param args: arguments (trace object; process model (process tree, petri net, BPMN)) + :param args: Arguments where the first is a trace object and the others represent the process model (process tree, Petri net, BPMN). + :param activity_key: Attribute to be used as the activity key (default is defined in `xes_constants.DEFAULT_NAME_KEY`). + :return: True if the trace fits the process model, False otherwise. :rtype: ``bool`` + + Note: + This is an internal method and is deprecated. 
""" from pm4py.util import variants_util from pm4py.convert import convert_to_process_tree, convert_to_petri_net @@ -673,7 +1115,7 @@ def check_is_fitting(*args, activity_key=xes_constants.DEFAULT_NAME_KEY) -> bool try: model = convert_to_process_tree(*model) except: - # the model cannot be expressed as a process tree, let's say if at least can be expressed as a Petri net + # If the model cannot be expressed as a process tree, attempt Petri net conversion model = convert_to_petri_net(*model) if not isinstance(trace, Trace): @@ -685,153 +1127,279 @@ def check_is_fitting(*args, activity_key=xes_constants.DEFAULT_NAME_KEY) -> bool if isinstance(model, ProcessTree): return __check_is_fit_process_tree(trace, model) elif isinstance(model, tuple) and isinstance(model[0], PetriNet): - return __check_is_fit_petri_net(trace, model[0], model[1], model[2], activity_key=activity_key) - - -def conformance_temporal_profile(log: Union[EventLog, pd.DataFrame], temporal_profile: Dict[Tuple[str, str], Tuple[float, float]], zeta: float = 1.0, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME) -> List[List[Tuple[float, float, float, float]]]: + return __check_is_fit_petri_net( + trace, + model[0], + model[1], + model[2], + activity_key=activity_key + ) + + +def conformance_temporal_profile( + log: Union[EventLog, pd.DataFrame], + temporal_profile: Dict[Tuple[str, str], Tuple[float, float]], + zeta: float = 1.0, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME +) -> List[List[Tuple[float, float, float, float]]]: """ - Performs conformance checking on the provided log with the provided temporal profile. + Perform conformance checking on the provided log using the provided temporal profile. The result is a list of time-based deviations for every case. - E.g. if the log on top of which the conformance is applied is the following (1 case): - A (timestamp: 2000-01) B (timestamp: 2002-01) - The difference between the timestamps of A and B is two years. If the temporal profile: - {('A', 'B'): (1.5 months, 0.5 months), ('A', 'C'): (5 months, 0), ('A', 'D'): (2 months, 0)} - is specified, and zeta is set to 1, then the aforementioned case would be deviating - (considering the couple of activities ('A', 'B')), because 2 years > 1.5 months + 0.5 months. - - :param log: log object - :param temporal_profile: temporal profile. E.g., if the log has two cases: A (timestamp: 1980-01) B (timestamp: 1980-03) C (timestamp: 1980-06); A (timestamp: 1990-01) B (timestamp: 1990-02) D (timestamp: 1990-03); The temporal profile will contain: {('A', 'B'): (1.5 months, 0.5 months), ('A', 'C'): (5 months, 0), ('A', 'D'): (2 months, 0)} - :param zeta: number of standard deviations allowed from the average. E.g. zeta=1 allows every timestamp between AVERAGE-STDEV and AVERAGE+STDEV. - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param return_diagnostics_dataframe: if possible, returns a dataframe with the diagnostics (instead of the usual output) - :rtype: ``List[List[Tuple[float, float, float, float]]]`` - .. 
code-block:: python3 + For example, consider a log with a single case: + - A (timestamp: 2000-01) + - B (timestamp: 2002-01) + + Given the temporal profile: + ```python + { + ('A', 'B'): (1.5, 0.5), # (mean, std) + ('A', 'C'): (5.0, 0.0), + ('A', 'D'): (2.0, 0.0) + } + ``` + and setting `zeta` to 1, the difference between the timestamps of A and B (2 years) exceeds the allowed time (1.5 months + 0.5 months), resulting in a deviation. + + :param log: Log object. + :param temporal_profile: Temporal profile. For example, if the log has two cases: + - Case 1: A (timestamp: 1980-01), B (timestamp: 1980-03), C (timestamp: 1980-06) + - Case 2: A (timestamp: 1990-01), B (timestamp: 1990-02), D (timestamp: 1990-03) + The temporal profile might look like: + ```python + { + ('A', 'B'): (1.5, 0.5), # (mean, std) + ('A', 'C'): (5.0, 0.0), + ('A', 'D'): (2.0, 0.0) + } + ``` + :param zeta: Number of standard deviations allowed from the average (default is 1.0). For example, `zeta=1` allows deviations within one standard deviation from the mean. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :param return_diagnostics_dataframe: If possible, returns a dataframe with the diagnostics instead of the usual output (default is `constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME`). + :return: A list containing lists of tuples representing time-based deviations for each case. + :rtype: ``List[List[Tuple[float, float, float, float]]]`` + Example: + ```python import pm4py - temporal_profile = pm4py.discover_temporal_profile(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - conformance_temporal_profile = pm4py.conformance_temporal_profile(dataframe, temporal_profile, zeta=1, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + temporal_profile = pm4py.discover_temporal_profile( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + conformance_temporal_profile = pm4py.conformance_temporal_profile( + dataframe, + temporal_profile, + zeta=1, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) + + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) properties["zeta"] = zeta from pm4py.algo.conformance.temporal_profile import algorithm as temporal_profile_conformance - result = temporal_profile_conformance.apply(log, temporal_profile, parameters=properties) + result = temporal_profile_conformance.apply( + log, + temporal_profile, + parameters=properties + ) if return_diagnostics_dataframe: - return temporal_profile_conformance.get_diagnostics_dataframe(log, result, parameters=properties) + return temporal_profile_conformance.get_diagnostics_dataframe( + log, + 
result, + parameters=properties + ) return result -def conformance_declare(log: Union[EventLog, pd.DataFrame], declare_model: Dict[str, Dict[Any, Dict[str, int]]], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME) -> List[Dict[str, Any]]: +def conformance_declare( + log: Union[EventLog, pd.DataFrame], + declare_model: Dict[str, Dict[Any, Dict[str, int]]], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME +) -> List[Dict[str, Any]]: """ - Applies conformance checking against a DECLARE model. + Apply conformance checking against a DECLARE model. Reference paper: - F. M. Maggi, A. J. Mooij and W. M. P. van der Aalst, "User-guided discovery of declarative process models," 2011 IEEE Symposium on Computational Intelligence and Data Mining (CIDM), Paris, France, 2011, pp. 192-199, doi: 10.1109/CIDM.2011.5949297. - - :param log: event log - :param declare_model: DECLARE model - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param return_diagnostics_dataframe: if possible, returns a dataframe with the diagnostics (instead of the usual output) + F. M. Maggi, A. J. Mooij, and W. M. P. van der Aalst, "User-guided discovery of declarative process models," 2011 IEEE Symposium on Computational Intelligence and Data Mining (CIDM), Paris, France, 2011, pp. 192-199, doi: 10.1109/CIDM.2011.5949297. + + :param log: Event log. + :param declare_model: DECLARE model represented as a nested dictionary. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :param return_diagnostics_dataframe: If possible, returns a dataframe with the diagnostics instead of the usual output (default is `constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME`). + :return: A list of dictionaries containing diagnostics for each trace. :rtype: ``List[Dict[str, Any]]`` - .. 
code-block:: python3 - + Example: + ```python import pm4py log = pm4py.read_xes("C:/receipt.xes") declare_model = pm4py.discover_declare(log) - conf_result = pm4py.conformance_declare(log, declare_model) + conf_result = pm4py.conformance_declare( + log, + declare_model, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, - case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) if return_diagnostics_dataframe: from pm4py.convert import convert_to_event_log log = convert_to_event_log(log, case_id_key=case_id_key) case_id_key = None - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.conformance.declare import algorithm as declare_conformance - result = declare_conformance.apply(log, declare_model, parameters=properties) + result = declare_conformance.apply( + log, + declare_model, + parameters=properties + ) if return_diagnostics_dataframe: - return declare_conformance.get_diagnostics_dataframe(log, result, parameters=properties) + return declare_conformance.get_diagnostics_dataframe( + log, + result, + parameters=properties + ) return result -def conformance_log_skeleton(log: Union[EventLog, pd.DataFrame], log_skeleton: Dict[str, Any], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME) -> List[Set[Any]]: +def conformance_log_skeleton( + log: Union[EventLog, pd.DataFrame], + log_skeleton: Dict[str, Any], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + return_diagnostics_dataframe: bool = constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME +) -> List[Set[Any]]: """ - Performs conformance checking using the log skeleton + Perform conformance checking using the log skeleton. Reference paper: Verbeek, H. M. W., and R. Medeiros de Carvalho. "Log skeletons: A classification approach to process discovery." arXiv preprint arXiv:1806.08247 (2018). - A log skeleton is a declarative model which consists of six different constraints: - - "directly_follows": specifies for some activities some strict bounds on the activities directly-following. For example, - 'A should be directly followed by B' and 'B should be directly followed by C'. - - "always_before": specifies that some activities may be executed only if some other activities are executed somewhen before - in the history of the case. - For example, 'C should always be preceded by A' - - "always_after": specifies that some activities should always trigger the execution of some other activities - in the future history of the case. - For example, 'A should always be followed by C' - - "equivalence": specifies that a given couple of activities should happen with the same number of occurrences inside - a case. - For example, 'B and C should always happen the same number of times'. - - "never_together": specifies that a given couple of activities should never happen together in the history of the case. 
- For example, 'there should be no case containing both C and D'. - - "activ_occurrences": specifies the allowed number of occurrences per activity: - E.g. A is allowed to be executed 1 or 2 times, B is allowed to be executed 1 or 2 or 3 or 4 times. - - :param log: log object - :param log_skeleton: log skeleton object, expressed as dictionaries of the six constraints (never_together, always_before ...) along with the discovered rules. - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param return_diagnostics_dataframe: if possible, returns a dataframe with the diagnostics (instead of the usual output) + A log skeleton is a declarative model consisting of six different constraints: + - **directly_follows**: Specifies strict bounds on activities directly following each other. For example, 'A should be directly followed by B' and 'B should be directly followed by C'. + - **always_before**: Specifies that certain activities may only be executed if some other activities have been executed earlier in the case history. For example, 'C should always be preceded by A'. + - **always_after**: Specifies that certain activities should always trigger the execution of other activities in the future history of the case. For example, 'A should always be followed by C'. + - **equivalence**: Specifies that pairs of activities should occur the same number of times within a case. For example, 'B and C should always happen the same number of times'. + - **never_together**: Specifies that certain pairs of activities should never occur together in the case history. For example, 'No case should contain both C and D'. + - **activ_occurrences**: Specifies the allowed number of occurrences per activity. For example, 'A is allowed to be executed 1 or 2 times, and B is allowed to be executed 1 to 4 times'. + + :param log: Log object. + :param log_skeleton: Log skeleton object, expressed as dictionaries of the six constraints along with the discovered rules. + :param activity_key: Attribute to be used for the activity (default is "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default is "time:timestamp"). + :param case_id_key: Attribute to be used as the case identifier (default is "case:concept:name"). + :param return_diagnostics_dataframe: If possible, returns a dataframe with the diagnostics instead of the usual output (default is `constants.DEFAULT_RETURN_DIAGNOSTICS_DATAFRAME`). + :return: A list of sets containing deviations for each case. :rtype: ``List[Set[Any]]`` - .. 
code-block:: python3 - + Example: + ```python import pm4py - log_skeleton = pm4py.discover_log_skeleton(dataframe, noise_threshold=0.1, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - conformance_lsk = pm4py.conformance_log_skeleton(dataframe, log_skeleton, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + log_skeleton = pm4py.discover_log_skeleton( + dataframe, + noise_threshold=0.1, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + conformance_lsk = pm4py.conformance_log_skeleton( + dataframe, + log_skeleton, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + ``` """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) if return_diagnostics_dataframe: from pm4py.convert import convert_to_event_log log = convert_to_event_log(log, case_id_key=case_id_key) case_id_key = None - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.conformance.log_skeleton import algorithm as log_skeleton_conformance - result = log_skeleton_conformance.apply(log, log_skeleton, parameters=properties) + result = log_skeleton_conformance.apply( + log, + log_skeleton, + parameters=properties + ) if return_diagnostics_dataframe: - return log_skeleton_conformance.get_diagnostics_dataframe(log, result, parameters=properties) + return log_skeleton_conformance.get_diagnostics_dataframe( + log, + result, + parameters=properties + ) return result diff --git a/pm4py/connectors.py b/pm4py/connectors.py index 19b3c80cd..8d58d92ee 100644 --- a/pm4py/connectors.py +++ b/pm4py/connectors.py @@ -27,18 +27,18 @@ def extract_log_outlook_mails() -> pd.DataFrame: """ - Extracts the history of the conversations from the local instance of Microsoft Outlook + Extracts the history of conversations from the local instance of Microsoft Outlook running on the current computer. - CASE ID (case:concept:name) => identifier of the conversation - ACTIVITY (concept:name) => activity that is performed in the current item (send e-mail, receive e-mail, - refuse meeting ...) - TIMESTAMP (time:timestamp) => timestamp of creation of the item in Outlook - RESOURCE (org:resource) => sender of the current item + Columns: + - **CASE ID (case:concept:name)**: Identifier of the conversation. + - **ACTIVITY (concept:name)**: Activity performed in the current item (e.g., send e-mail, receive e-mail, refuse meeting). + - **TIMESTAMP (time:timestamp)**: Timestamp of creation of the item in Outlook. + - **RESOURCE (org:resource)**: Sender of the current item. 
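Since the extracted mail log uses pm4py's standard attribute names listed above, it can in principle be handed straight to the discovery API refactored later in this changeset. A minimal sketch, assuming a local Outlook installation and a successful extraction:

```python
import pm4py

# Hypothetical downstream use of the extracted Outlook mail log.
dataframe = pm4py.connectors.extract_log_outlook_mails()

# The standard column names make the dataframe directly usable by discover_dfg.
dfg, start_activities, end_activities = pm4py.discover_dfg(
    dataframe,
    activity_key='concept:name',
    timestamp_key='time:timestamp',
    case_id_key='case:concept:name'
)
```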
See also: - * https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.mailitem?redirectedfrom=MSDN&view=outlook-pia#properties_ - * https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.olobjectclass?view=outlook-pia + * [MailItem Properties](https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.mailitem?redirectedfrom=MSDN&view=outlook-pia#properties_) + * [OlObjectClass Enumeration](https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.olobjectclass?view=outlook-pia) :rtype: ``pd.DataFrame`` @@ -53,23 +53,27 @@ def extract_log_outlook_mails() -> pd.DataFrame: def extract_log_outlook_calendar(email_user: Optional[str] = None, calendar_id: int = 9) -> pd.DataFrame: """ - Extracts the history of the calendar events (creation, update, start, end) - in a Pandas dataframe from the local Outlook instance running on the current computer. + Extracts the history of calendar events (creation, update, start, end) + into a Pandas DataFrame from the local Outlook instance running on the current computer. - CASE ID (case:concept:name) => identifier of the meeting - ACTIVITY (concept:name) => one between: Meeting Created, Last Change of Meeting, Meeting Started, Meeting Completed - TIMESTAMP (time:timestamp) => the timestamp of the event - case:subject => the subject of the meeting + Columns: + - **CASE ID (case:concept:name)**: Identifier of the meeting. + - **ACTIVITY (concept:name)**: One of the following activities: Meeting Created, Last Change of Meeting, Meeting Started, Meeting Completed. + - **TIMESTAMP (time:timestamp)**: Timestamp of the event. + - **case:subject**: Subject of the meeting. - :param email_user: (optional) e-mail address from which the (shared) calendar should be extracted - :param calendar_id: identifier of the calendar for the given user (default: 9) + :param email_user: (optional) E-mail address from which the (shared) calendar should be extracted. + :param calendar_id: Identifier of the calendar for the given user (default: 9). :rtype: ``pd.DataFrame`` .. code-block:: python3 import pm4py + # Extract using default parameters dataframe = pm4py.connectors.extract_log_outlook_calendar() + + # Extract using a specific email user dataframe = pm4py.connectors.extract_log_outlook_calendar("vacation-calendar@workplace.eu") """ from pm4py.algo.connectors.variants import outlook_calendar @@ -81,13 +85,14 @@ def extract_log_outlook_calendar(email_user: Optional[str] = None, calendar_id: def extract_log_windows_events() -> pd.DataFrame: """ - Extract a process mining dataframe from all the events recorded in the Windows registry. + Extracts a process mining DataFrame from all events recorded in the Windows registry. - CASE ID (case:concept:name) => name of the computer emitting the events. - ACTIVITY (concept:name) => concatenation of the source name of the event and the event identifier - (see https://learn.microsoft.com/en-us/previous-versions/windows/desktop/eventlogprov/win32-ntlogevent) - TIMESTAMP (time:timestamp) => timestamp of generation of the event - RESOURCE (org:resource) => username involved in the event + Columns: + - **CASE ID (case:concept:name)**: Name of the computer emitting the events. + - **ACTIVITY (concept:name)**: Concatenation of the source name of the event and the event identifier. 
+ (See [Win32_NTLogEvent](https://learn.microsoft.com/en-us/previous-versions/windows/desktop/eventlogprov/win32-ntlogevent)) + - **TIMESTAMP (time:timestamp)**: Timestamp of event generation. + - **RESOURCE (org:resource)**: Username involved in the event. :rtype: ``pd.DataFrame`` @@ -102,14 +107,15 @@ def extract_log_windows_events() -> pd.DataFrame: def extract_log_chrome_history(history_db_path: Optional[str] = None) -> pd.DataFrame: """ - Extracts a dataframe containing the navigation history of Google Chrome. - Please keep Google Chrome history closed when extracting. + Extracts a DataFrame containing the navigation history of Google Chrome. + Please ensure that Google Chrome history is closed when extracting. - CASE ID (case:concept:name) => an identifier of the profile that has been extracted - ACTIVITY (concept:name) => the complete path of the website, minus the GET arguments - TIMESTAMP (time:timestamp) => the timestamp of visit + Columns: + - **CASE ID (case:concept:name)**: Identifier of the extracted profile. + - **ACTIVITY (concept:name)**: Complete path of the website, excluding GET arguments. + - **TIMESTAMP (time:timestamp)**: Timestamp of the visit. - :param history_db_path: path to the history DB path of Google Chrome (default: position of the Windows folder) + :param history_db_path: Path to the Google Chrome history database (default: location of the Windows folder). :rtype: ``pd.DataFrame`` .. code-block:: python3 @@ -126,14 +132,15 @@ def extract_log_chrome_history(history_db_path: Optional[str] = None) -> pd.Data def extract_log_firefox_history(history_db_path: Optional[str] = None) -> pd.DataFrame: """ - Extracts a dataframe containing the navigation history of Mozilla Firefox. - Please keep Google Chrome history closed when extracting. + Extracts a DataFrame containing the navigation history of Mozilla Firefox. + Please ensure that Mozilla Firefox history is closed when extracting. - CASE ID (case:concept:name) => an identifier of the profile that has been extracted - ACTIVITY (concept:name) => the complete path of the website, minus the GET arguments - TIMESTAMP (time:timestamp) => the timestamp of visit + Columns: + - **CASE ID (case:concept:name)**: Identifier of the extracted profile. + - **ACTIVITY (concept:name)**: Complete path of the website, excluding GET arguments. + - **TIMESTAMP (time:timestamp)**: Timestamp of the visit. - :param history_db_path: path to the history DB path of Mozilla Firefox (default: position of the Windows folder) + :param history_db_path: Path to the Mozilla Firefox history database (default: location of the Windows folder). :rtype: ``pd.DataFrame`` .. code-block:: python3 @@ -150,13 +157,12 @@ def extract_log_firefox_history(history_db_path: Optional[str] = None) -> pd.Dat def extract_log_github(owner: str = "pm4py", repo: str = "pm4py-core", auth_token: Optional[str] = None) -> pd.DataFrame: """ - Extracts a dataframe containing the history of the issues of a Github repository. - According to the API limit rate of public/registered users, only a part of the events - can be returned. + Extracts a DataFrame containing the history of issues from a GitHub repository. + Due to API rate limits for public and registered users, only a subset of events may be returned. - :param owner: owner of the repository (e.g., pm4py) - :param repo: name of the repository (e.g., pm4py-core) - :param auth_token: authorization token + :param owner: Owner of the repository (e.g., pm4py). + :param repo: Name of the repository (e.g., pm4py-core). 
+ :param auth_token: Authorization token. :rtype: ``pd.DataFrame`` .. code-block:: python3 @@ -174,16 +180,18 @@ def extract_log_github(owner: str = "pm4py", repo: str = "pm4py-core", auth_toke def extract_log_camunda_workflow(connection_string: str) -> pd.DataFrame: """ - Extracts a dataframe from the Camunda workflow system. Aside from the traditional columns, - the processID of the process in Camunda is returned. + Extracts a DataFrame from the Camunda workflow system. In addition to traditional columns, + the process ID of the process in Camunda is included. - :param connection_string: ODBC connection string to the Camunda database + :param connection_string: ODBC connection string to the Camunda database. :rtype: ``pd.DataFrame`` .. code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_log_camunda_workflow('Driver={PostgreSQL Unicode(x64)};SERVER=127.0.0.3;DATABASE=process-engine;UID=xx;PWD=yy') + dataframe = pm4py.connectors.extract_log_camunda_workflow( + 'Driver={PostgreSQL Unicode(x64)};SERVER=127.0.0.3;DATABASE=process-engine;UID=xx;PWD=yy' + ) """ from pm4py.algo.connectors.variants import camunda_workflow parameters = {} @@ -193,16 +201,18 @@ def extract_log_camunda_workflow(connection_string: str) -> pd.DataFrame: def extract_log_sap_o2c(connection_string: str, prefix: str = "") -> pd.DataFrame: """ - Extracts a dataframe for the SAP O2C process. + Extracts a DataFrame for the SAP Order-to-Cash (O2C) process. - :param connection_string: ODBC connection string to the SAP database - :param prefix: prefix for the tables (example: SAPSR3.) + :param connection_string: ODBC connection string to the SAP database. + :param prefix: Prefix for the tables (e.g., SAPSR3.). :rtype: ``pd.DataFrame`` .. code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_log_sap_o2c('Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy') + dataframe = pm4py.connectors.extract_log_sap_o2c( + 'Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy' + ) """ from pm4py.algo.connectors.variants import sap_o2c parameters = {} @@ -213,16 +223,18 @@ def extract_log_sap_o2c(connection_string: str, prefix: str = "") -> pd.DataFram def extract_log_sap_accounting(connection_string: str, prefix: str = "") -> pd.DataFrame: """ - Extracts a dataframe for the SAP Accounting process. + Extracts a DataFrame for the SAP Accounting process. - :param connection_string: ODBC connection string to the SAP database - :param prefix: prefix for the tables (example: SAPSR3.) + :param connection_string: ODBC connection string to the SAP database. + :param prefix: Prefix for the tables (e.g., SAPSR3.). :rtype: ``pd.DataFrame`` .. code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_log_sap_accounting('Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy') + dataframe = pm4py.connectors.extract_log_sap_accounting( + 'Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy' + ) """ from pm4py.algo.connectors.variants import sap_accounting parameters = {} @@ -233,21 +245,21 @@ def extract_log_sap_accounting(connection_string: str, prefix: str = "") -> pd.D def extract_ocel_outlook_mails() -> OCEL: """ - Extracts the history of the conversations from the local instance of Microsoft Outlook + Extracts the history of conversations from the local instance of Microsoft Outlook running on the current computer as an object-centric event log. 
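As a hedged illustration only, the resulting OCEL can be passed to the object-centric discovery helpers, assuming ``pm4py.discover_ocdfg`` and ``pm4py.view_ocdfg`` are available in the installed version and Graphviz is present:

```python
import pm4py

# Hypothetical follow-up: discover an object-centric DFG from the extracted
# OCEL and render it (assumes the extraction above succeeded).
ocel = pm4py.connectors.extract_ocel_outlook_mails()
ocdfg = pm4py.discover_ocdfg(ocel)
pm4py.view_ocdfg(ocdfg, format="svg")
```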
- ACTIVITY (ocel:activity) => activity that is performed in the current item (send e-mail, receive e-mail, - refuse meeting ...) - TIMESTAMP (ocel:timestamp) => timestamp of creation of the item in Outlook + Columns: + - **ACTIVITY (ocel:activity)**: Activity performed in the current item (e.g., send e-mail, receive e-mail, refuse meeting). + - **TIMESTAMP (ocel:timestamp)**: Timestamp of creation of the item in Outlook. - Object types: - - org:resource => the snder of the mail - - recipients => the list of recipients of the mail - - topic => the topic of the discussion + Object Types: + - **org:resource**: Sender of the mail. + - **recipients**: List of recipients of the mail. + - **topic**: Topic of the discussion. See also: - * https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.mailitem?redirectedfrom=MSDN&view=outlook-pia#properties_ - * https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.olobjectclass?view=outlook-pia + * [MailItem Properties](https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.mailitem?redirectedfrom=MSDN&view=outlook-pia#properties_) + * [OlObjectClass Enumeration](https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.outlook.olobjectclass?view=outlook-pia) :rtype: ``OCEL`` @@ -258,53 +270,66 @@ def extract_ocel_outlook_mails() -> OCEL: """ import pm4py dataframe = pm4py.connectors.extract_log_outlook_mails() - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["org:resource", "recipients", "topic"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["org:resource", "recipients", "topic"] + ) def extract_ocel_outlook_calendar(email_user: Optional[str] = None, calendar_id: int = 9) -> OCEL: """ - Extracts the history of the calendar events (creation, update, start, end) + Extracts the history of calendar events (creation, update, start, end) as an object-centric event log from the local Outlook instance running on the current computer. - ACTIVITY (ocel:activity) => one between: Meeting Created, Last Change of Meeting, Meeting Started, Meeting Completed - TIMESTAMP (ocel:timestamp) => the timestamp of the event - - Object types: - - case:concept:name => identifier of the meeting - - case:subject => the subject of the meeting + Columns: + - **ACTIVITY (ocel:activity)**: One of the following activities: Meeting Created, Last Change of Meeting, Meeting Started, Meeting Completed. + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the event. - :param email_user: (optional) e-mail address from which the (shared) calendar should be extracted - :param calendar_id: identifier of the calendar for the given user (default: 9) + Object Types: + - **case:concept:name**: Identifier of the meeting. + - **case:subject**: Subject of the meeting. + :param email_user: (optional) E-mail address from which the (shared) calendar should be extracted. + :param calendar_id: Identifier of the calendar for the given user (default: 9). :rtype: ``OCEL`` ..
code-block:: python3 import pm4py + # Extract using default parameters ocel = pm4py.connectors.extract_ocel_outlook_calendar() + + # Extract using a specific email user ocel = pm4py.connectors.extract_ocel_outlook_calendar("vacation-calendar@workplace.eu") """ import pm4py dataframe = pm4py.connectors.extract_log_outlook_calendar(email_user, calendar_id) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "case:subject"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "case:subject"] + ) def extract_ocel_windows_events() -> OCEL: """ - Extract a process mining dataframe from all the events recorded in the Windows registry as an object-centric - event log. + Extracts an object-centric event log from all events recorded in the Windows registry. - ACTIVITY (concept:name) => concatenation of the source name of the event and the event identifier - (see https://learn.microsoft.com/en-us/previous-versions/windows/desktop/eventlogprov/win32-ntlogevent) - TIMESTAMP (time:timestamp) => timestamp of generation of the event + Columns: + - **ACTIVITY (ocel:activity)**: Concatenation of the source name of the event and the event identifier. + (See [Win32_NTLogEvent](https://learn.microsoft.com/en-us/previous-versions/windows/desktop/eventlogprov/win32-ntlogevent)) + - **TIMESTAMP (ocel:timestamp)**: Timestamp of event generation. - Object types: - - categoryString: translation of the subcategory. The translation is source-specific. - - computerName: name of the computer that generated this event. - - eventIdentifier: identifier of the event. This is specific to the source that generated the event log entry. - - eventType: 1=Error; 2=Warning; 3=Information; 4=Security Audit Success;5=Security Audit Failure; - - sourceName: name of the source (application, service, driver, or subsystem) that generated the entry. - - user: user name of the logged-on user when the event occurred. If the user name cannot be determined, this will be NULL. + Object Types: + - **categoryString**: Translation of the subcategory. The translation is source-specific. + - **computerName**: Name of the computer that generated the event. + - **eventIdentifier**: Identifier of the event, specific to the source that generated the event log entry. + - **eventType**: Event type classification (1=Error; 2=Warning; 3=Information; 4=Security Audit Success; 5=Security Audit Failure). + - **sourceName**: Name of the source (application, service, driver, or subsystem) that generated the entry. + - **user**: Username of the logged-on user when the event occurred. If the username cannot be determined, this will be NULL. :rtype: ``OCEL`` @@ -315,140 +340,208 @@ def extract_ocel_windows_events() -> OCEL: """ import pm4py dataframe = pm4py.connectors.extract_log_windows_events() - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["categoryString", "computerName", "eventIdentifier", "eventType", "sourceName", "user"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["categoryString", "computerName", "eventIdentifier", "eventType", "sourceName", "user"] + ) def extract_ocel_chrome_history(history_db_path: Optional[str] = None) -> OCEL: """ Extracts an object-centric event log containing the navigation history of Google Chrome.
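One plausible next step for the Windows-event OCEL above is to flatten it on one of the listed object types into a classical event log; a sketch, where the choice of ``computerName`` is illustrative only:

```python
import pm4py

# Hypothetical: flatten the object-centric Windows-event log on the
# "computerName" object type, yielding one classical case per computer.
ocel = pm4py.connectors.extract_ocel_windows_events()
flattened = pm4py.ocel_flattening(ocel, "computerName")
```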
- Please keep Google Chrome history closed when extracting. + Please ensure that Google Chrome history is closed when extracting. - ACTIVITY (ocel:activity) => the complete path of the website, minus the GET arguments - TIMESTAMP (ocel:timestamp) => the timestamp of visit + Columns: + - **ACTIVITY (ocel:activity)**: Complete path of the website, excluding GET arguments. + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the visit. Object Types: - - case:concept:name : the profile of Chrome that is used to visit the site - - complete_url: the complete URL of the website - - url_wo_parameters: complete URL minus the part after ? - - domain: the domain of the website that is visited + - **case:concept:name**: Profile of Chrome used to visit the site. + - **complete_url**: Complete URL of the website. + - **url_wo_parameters**: Complete URL excluding the part after '?'. + - **domain**: Domain of the visited website. - :param history_db_path: path to the history DB path of Google Chrome (default: position of the Windows folder) + :param history_db_path: Path to the Google Chrome history database (default: location of the Windows folder). :rtype: ``OCEL`` .. code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_ocel_chrome_history() + ocel = pm4py.connectors.extract_ocel_chrome_history() """ import pm4py dataframe = pm4py.connectors.extract_log_chrome_history(history_db_path) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "complete_url", "url_wo_parameters", "domain"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "complete_url", "url_wo_parameters", "domain"] + ) def extract_ocel_firefox_history(history_db_path: Optional[str] = None) -> OCEL: """ Extracts an object-centric event log containing the navigation history of Mozilla Firefox. - Please keep Mozilla Firefox history closed when extracting. + Please ensure that Mozilla Firefox history is closed when extracting. - ACTIVITY (ocel:activity) => the complete path of the website, minus the GET arguments - TIMESTAMP (ocel:timestamp) => the timestamp of visit + Columns: + - **ACTIVITY (ocel:activity)**: Complete path of the website, excluding GET arguments. + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the visit. Object Types: - - case:concept:name : the profile of Firefox that is used to visit the site - - complete_url: the complete URL of the website - - url_wo_parameters: complete URL minus the part after ? - - domain: the domain of the website that is visited + - **case:concept:name**: Profile of Firefox used to visit the site. + - **complete_url**: Complete URL of the website. + - **url_wo_parameters**: Complete URL excluding the part after '?'. + - **domain**: Domain of the visited website. - :param history_db_path: path to the history DB path of Mozilla Firefox (default: position of the Windows folder) + :param history_db_path: Path to the Mozilla Firefox history database (default: location of the Windows folder). :rtype: ``OCEL`` ..
code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_ocel_firefox_history() + ocel = pm4py.connectors.extract_ocel_firefox_history() """ import pm4py dataframe = pm4py.connectors.extract_log_firefox_history(history_db_path) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "complete_url", "url_wo_parameters", "domain"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "complete_url", "url_wo_parameters", "domain"] + ) def extract_ocel_github(owner: str = "pm4py", repo: str = "pm4py-core", auth_token: Optional[str] = None) -> OCEL: """ - Extracts a dataframe containing the history of the issues of a Github repository. - According to the API limit rate of public/registered users, only a part of the events - can be returned. + Extracts an object-centric event log containing the history of issues from a GitHub repository. + Due to API rate limits for public and registered users, only a subset of events may be returned. - ACTIVITY (ocel:activity) => the event (created, commented, closed, subscribed ...) - TIMESTAMP (ocel:timestamp) => the timestamp of execution of the event + Columns: + - **ACTIVITY (ocel:activity)**: The event type (e.g., created, commented, closed, subscribed). + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the event execution. - Object types: - - case:concept:name => the URL of the events related to the issue - - org:resource => the involved resource - - case:repo => the repository in which the issue is created + Object Types: + - **case:concept:name**: URL of the events related to the issue. + - **org:resource**: Involved resource. + - **case:repo**: Repository in which the issue was created. - :param owner: owner of the repository (e.g., pm4py) - :param repo: name of the repository (e.g., pm4py-core) - :param auth_token: authorization token + :param owner: Owner of the repository (e.g., pm4py). + :param repo: Name of the repository (e.g., pm4py-core). + :param auth_token: Authorization token. :rtype: ``OCEL`` .. code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_ocel_github(owner='pm4py', repo='pm4py-core') + ocel = pm4py.connectors.extract_ocel_github(owner='pm4py', repo='pm4py-core') """ import pm4py dataframe = pm4py.connectors.extract_log_github(owner, repo, auth_token) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "org:resource", "case:repo"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "org:resource", "case:repo"] + ) def extract_ocel_camunda_workflow(connection_string: str) -> OCEL: """ Extracts an object-centric event log from the Camunda workflow system. - :param connection_string: ODBC connection string to the Camunda database - :rtype: ``pd.DataFrame`` + Columns: + - **ACTIVITY (ocel:activity)**: Activity performed within Camunda. + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the activity execution. + + Object Types: + - **case:concept:name**: Identifier of the case. + - **processID**: Process ID within Camunda. + - **org:resource**: Resource involved in the activity. + + :param connection_string: ODBC connection string to the Camunda database. + :rtype: ``OCEL`` ..
code-block:: python3 import pm4py - ocel = pm4py.connectors.extract_ocel_camunda_workflow('Driver={PostgreSQL Unicode(x64)};SERVER=127.0.0.3;DATABASE=process-engine;UID=xx;PWD=yy') + ocel = pm4py.connectors.extract_ocel_camunda_workflow( + 'Driver={PostgreSQL Unicode(x64)};SERVER=127.0.0.3;DATABASE=process-engine;UID=xx;PWD=yy' + ) """ import pm4py dataframe = pm4py.connectors.extract_log_camunda_workflow(connection_string) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "processID", "org:resource"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "processID", "org:resource"] + ) def extract_ocel_sap_o2c(connection_string: str, prefix: str = '') -> OCEL: """ - Extracts an object-centric event log for the SAP O2C process. + Extracts an object-centric event log for the SAP Order-to-Cash (O2C) process. - :param connection_string: ODBC connection string to the SAP database - :param prefix: prefix for the tables (example: SAPSR3.) - :rtype: ``pd.DataFrame`` + Columns: + - **ACTIVITY (ocel:activity)**: Activity performed in the O2C process. + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the activity execution. + + Object Types: + - **case:concept:name**: Identifier of the case. + - **org:resource**: Resource involved in the activity. + + :param connection_string: ODBC connection string to the SAP database. + :param prefix: Prefix for the tables (e.g., SAPSR3.). + :rtype: ``OCEL`` .. code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_ocel_sap_o2c('Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy') + ocel = pm4py.connectors.extract_ocel_sap_o2c( + 'Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy' + ) """ import pm4py dataframe = pm4py.connectors.extract_log_sap_o2c(connection_string, prefix=prefix) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "org:resource"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "org:resource"] + ) def extract_ocel_sap_accounting(connection_string: str, prefix: str = '') -> OCEL: """ Extracts an object-centric event log for the SAP Accounting process. - :param connection_string: ODBC connection string to the SAP database - :param prefix: prefix for the tables (example: SAPSR3.) - :rtype: ``pd.DataFrame`` + Columns: + - **ACTIVITY (ocel:activity)**: Activity performed in the Accounting process. + - **TIMESTAMP (ocel:timestamp)**: Timestamp of the activity execution. + + Object Types: + - **case:concept:name**: Identifier of the case. + - **org:resource**: Resource involved in the activity. + + :param connection_string: ODBC connection string to the SAP database. + :param prefix: Prefix for the tables (e.g., SAPSR3.). + :rtype: ``OCEL`` ..
code-block:: python3 import pm4py - dataframe = pm4py.connectors.extract_ocel_sap_accounting('Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy') + ocel = pm4py.connectors.extract_ocel_sap_accounting( + 'Driver={Oracle in instantclient_21_6};DBQ=127.0.0.3:1521/ZIB;UID=xx;PWD=yy' + ) """ import pm4py dataframe = pm4py.connectors.extract_log_sap_accounting(connection_string, prefix=prefix) - return pm4py.convert_log_to_ocel(dataframe, "concept:name", "time:timestamp", ["case:concept:name", "org:resource"]) + return pm4py.convert_log_to_ocel( + dataframe, + activity_column="concept:name", + timestamp_column="time:timestamp", + object_types=["case:concept:name", "org:resource"] + ) diff --git a/pm4py/convert.py b/pm4py/convert.py index ab2a98599..31c78db15 100644 --- a/pm4py/convert.py +++ b/pm4py/convert.py @@ -45,12 +45,13 @@ def convert_to_event_log(obj: Union[pd.DataFrame, EventStream], case_id_key: str = "case:concept:name", **kwargs) -> EventLog: """ - Converts a DataFrame/EventStream object to an event log object + Converts a DataFrame or EventStream object to an event log object. + + :param obj: The DataFrame or EventStream object to convert. + :param case_id_key: The attribute to be used as the case identifier. Defaults to "case:concept:name". + :param kwargs: Additional keyword arguments to pass to the converter. + :return: An ``EventLog`` object. - :param obj: DataFrame or EventStream object - :param case_id_key: attribute to be used as case identifier - :rtype: ``EventLog`` - .. code-block:: python3 import pandas as pd @@ -78,19 +79,19 @@ def convert_to_event_log(obj: Union[pd.DataFrame, EventStream], case_id_key: str def convert_to_event_stream(obj: Union[EventLog, pd.DataFrame], case_id_key: str = "case:concept:name", **kwargs) -> EventStream: """ - Converts a log object to an event stream + Converts a log object or DataFrame to an event stream. + + :param obj: The log object (``EventLog``) or DataFrame to convert. + :param case_id_key: The attribute to be used as the case identifier. Defaults to "case:concept:name". + :param kwargs: Additional keyword arguments to pass to the converter. + :return: An ``EventStream`` object. - :param obj: log object - :param case_id_key: attribute to be used as case identifier - :rtype: ``EventStream`` - .. code-block:: python3 import pm4py log = pm4py.read_xes("tests/input_data/running-example.xes") event_stream = pm4py.convert_to_event_stream(log) - """ if check_is_pandas_dataframe(obj): check_pandas_dataframe_columns(obj, case_id_key=case_id_key) @@ -109,11 +110,12 @@ def convert_to_event_stream(obj: Union[EventLog, pd.DataFrame], case_id_key: str def convert_to_dataframe(obj: Union[EventStream, EventLog], **kwargs) -> pd.DataFrame: """ - Converts a log object to a dataframe + Converts a log object (``EventStream`` or ``EventLog``) to a Pandas DataFrame. + + :param obj: The log object to convert. + :param kwargs: Additional keyword arguments to pass to the converter. + :return: A ``pd.DataFrame`` object. - :param obj: log object - :rtype: ``pd.DataFrame`` - ..
code-block:: python3 import pm4py @@ -127,7 +129,7 @@ def convert_to_dataframe(obj: Union[EventStream, EventLog], **kwargs) -> pd.Data parameters = get_properties(obj) for k, v in kwargs.items(): parameters[k] = v - + from pm4py.objects.conversion.log import converter df = converter.apply(obj, variant=converter.Variants.TO_DATA_FRAME, parameters=parameters) return df @@ -136,18 +138,21 @@ def convert_to_dataframe(obj: Union[EventStream, EventLog], **kwargs) -> pd.Data def convert_to_bpmn(*args: Union[Tuple[PetriNet, Marking, Marking], ProcessTree]) -> BPMN: """ Converts an object to a BPMN diagram. - As an input, either a Petri net (with corresponding initial and final marking) or a process tree can be provided. - A process tree can always be converted into a BPMN model and thus quality of the result object is guaranteed. - For Petri nets, the quality of the converison largely depends on the net provided (e.g., sound WF-nets are likely to produce reasonable BPMN models) - :param args: petri net (with initial and final marking) or process tree - :rtype: ``BPMN`` - + As input, either a Petri net (with corresponding initial and final markings) or a process tree can be provided. + A process tree can always be converted into a BPMN model, ensuring the quality of the resulting object. + For Petri nets, the quality of the conversion largely depends on the net provided (e.g., sound WF-nets are likely to produce reasonable BPMN models). + + :param args: + - If converting a Petri net: a tuple of (``PetriNet``, ``Marking``, ``Marking``). + - If converting a process tree: a single ``ProcessTree`` object. + :return: A ``BPMN`` object. + .. code-block:: python3 import pm4py - # import a Petri net from a file + # Import a Petri net from a file net, im, fm = pm4py.read_pnml("tests/input_data/running-example.pnml") bpmn_graph = pm4py.convert_to_bpmn(net, im, fm) """ @@ -171,23 +176,27 @@ def convert_to_bpmn(*args: Union[Tuple[PetriNet, Marking, Marking], ProcessTree] # don't do nothing and throw the following exception pass # if no conversion is done, then the format of the arguments is unsupported - raise Exception("unsupported conversion of the provided object to BPMN") + raise Exception("Unsupported conversion of the provided object to BPMN") def convert_to_petri_net(*args: Union[BPMN, ProcessTree, HeuristicsNet, POWL, dict]) -> Tuple[PetriNet, Marking, Marking]: """ Converts an input model to an (accepting) Petri net. - The input objects can either be a process tree, BPMN model or a Heuristic net. - The output is a triple, containing the Petri net and the initial and final markings. The markings are only returned if they can be reasonable derived from the input model. - :param args: process tree, Heuristics net, BPMN or POWL model - :rtype: ``Tuple[PetriNet, Marking, Marking]`` - + The input objects can be a process tree, BPMN model, Heuristic net, POWL model, or a dictionary representing a Directly-Follows Graph (DFG). + The output is a tuple containing the Petri net and the initial and final markings. + The markings are only returned if they can be reasonably derived from the input model. + + :param args: + - If converting from a BPMN, ProcessTree, HeuristicsNet, or POWL: a single object of the respective type. + - If converting from a DFG: a dictionary representing the DFG, followed by lists of start and end activities. + :return: A tuple of (``PetriNet``, ``Marking``, ``Marking``). + .. 
code-block:: python3 import pm4py - # imports a process tree from a PTML file + # Imports a process tree from a PTML file process_tree = pm4py.read_ptml("tests/input_data/running-example.ptml") net, im, fm = pm4py.convert_to_petri_net(process_tree) """ @@ -209,29 +218,36 @@ def convert_to_petri_net(*args: Union[BPMN, ProcessTree, HeuristicsNet, POWL, di elif isinstance(args[0], dict): # DFG from pm4py.objects.conversion.dfg.variants import to_petri_net_activity_defines_place - return to_petri_net_activity_defines_place.apply(args[0], parameters={ - to_petri_net_activity_defines_place.Parameters.START_ACTIVITIES: args[1], - to_petri_net_activity_defines_place.Parameters.END_ACTIVITIES: args[2]}) + return to_petri_net_activity_defines_place.apply( + args[0], + parameters={ + to_petri_net_activity_defines_place.Parameters.START_ACTIVITIES: args[1], + to_petri_net_activity_defines_place.Parameters.END_ACTIVITIES: args[2] + } + ) # if no conversion is done, then the format of the arguments is unsupported - raise Exception("unsupported conversion of the provided object to Petri net") + raise Exception("Unsupported conversion of the provided object to Petri net") -def convert_to_process_tree(*args: Union[Tuple[PetriNet, Marking, Marking], BPMN]) -> ProcessTree: +def convert_to_process_tree(*args: Union[Tuple[PetriNet, Marking, Marking], BPMN, ProcessTree]) -> ProcessTree: """ Converts an input model to a process tree. - The input models can either be Petri nets (marked) or BPMN models. - For both input types, the conversion is not guaranteed to work, hence, invocation of the method can yield an Exception. - :param args: petri net (along with initial and final marking) or BPMN - :rtype: ``ProcessTree`` - + The input models can be Petri nets (with markings) or BPMN models. + For both input types, the conversion is not guaranteed to work and may raise an exception. + + :param args: + - If converting from a Petri net: a tuple of (``PetriNet``, ``Marking``, ``Marking``). + - If converting from a BPMN or ProcessTree: a single object of the respective type. + :return: A ``ProcessTree`` object. + .. code-block:: python3 import pm4py - # imports a BPMN file + # Imports a BPMN file bpmn_graph = pm4py.read_bpmn("tests/input_data/running-example.bpmn") - # converts the BPMN to a process tree (through intermediate conversion to a Petri net) + # Converts the BPMN to a process tree (through intermediate conversion to a Petri net) process_tree = pm4py.convert_to_process_tree(bpmn_graph) """ from pm4py.objects.process_tree.obj import ProcessTree @@ -250,25 +266,28 @@ def convert_to_process_tree(*args: Union[Tuple[PetriNet, Marking, Marking], BPMN if tree is not None: return tree - raise Exception("the object represents a model that cannot be represented as a process tree!") + raise Exception("The object represents a model that cannot be represented as a process tree!") def convert_to_reachability_graph(*args: Union[Tuple[PetriNet, Marking, Marking], BPMN, ProcessTree]) -> TransitionSystem: """ Converts an input model to a reachability graph (transition system). - The input models can either be Petri nets (with markings), BPMN models or process trees. - The output is the state-space of the model (i.e., the reachability graph), enocdoed as a ``TransitionSystem`` object. - :param args: petri net (along with initial and final marking), process tree or BPMN - :rtype: ``TransitionSystem`` - + The input models can be Petri nets (with markings), BPMN models, or process trees. 
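Returning to ``convert_to_petri_net`` above, the DFG branch takes the DFG dictionary first, followed by the start- and end-activity collections. A minimal sketch, assuming the dictionaries returned by ``pm4py.discover_dfg`` are accepted as-is:

```python
import pm4py

# Hypothetical use of the DFG branch of convert_to_petri_net:
# pass the DFG dictionary first, then the start and end activities.
log = pm4py.read_xes("tests/input_data/running-example.xes")
dfg, start_activities, end_activities = pm4py.discover_dfg(log)
net, im, fm = pm4py.convert_to_petri_net(dfg, start_activities, end_activities)
```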
+ The output is the state-space of the model, encoded as a ``TransitionSystem`` object. + + :param args: + - If converting from a Petri net: a tuple of (``PetriNet``, ``Marking``, ``Marking``). + - If converting from a BPMN or ProcessTree: a single object of the respective type. + :return: A ``TransitionSystem`` object. + .. code-block:: python3 import pm4py - # reads a Petri net from a file + # Reads a Petri net from a file net, im, fm = pm4py.read_pnml("tests/input_data/running-example.pnml") - # converts it to reachability graph + # Converts it to a reachability graph reach_graph = pm4py.convert_to_reachability_graph(net, im, fm) """ if isinstance(args[0], PetriNet): @@ -280,25 +299,36 @@ def convert_to_reachability_graph(*args: Union[Tuple[PetriNet, Marking, Marking] return reachability_graph.construct_reachability_graph(net, im) -def convert_log_to_ocel(log: Union[EventLog, EventStream, pd.DataFrame], activity_column: str = "concept:name", timestamp_column: str = "time:timestamp", object_types: Optional[Collection[str]] = None, obj_separator: str = " AND ", additional_event_attributes: Optional[Collection[str]] = None, additional_object_attributes: Optional[Dict[str, Collection[str]]] = None) -> OCEL: +def convert_log_to_ocel( + log: Union[EventLog, EventStream, pd.DataFrame], + activity_column: str = "concept:name", + timestamp_column: str = "time:timestamp", + object_types: Optional[Collection[str]] = None, + obj_separator: str = " AND ", + additional_event_attributes: Optional[Collection[str]] = None, + additional_object_attributes: Optional[Dict[str, Collection[str]]] = None +) -> OCEL: """ - Converts an event log to an object-centric event log with one or more than one - object types. - - :param log_obj: log object - :param activity_column: activity column - :param timestamp_column: timestamp column - :param object_types: list of columns to consider as object types - :param obj_separator: separator between different objects in the same column - :param additional_event_attributes: additional attributes to be considered as event attributes in the OCEL - :param additional_object_attributes: additional attributes per object type to be considered as object attributes in the OCEL (dictionary in which object types are associated to their attributes, i.e., {"order": ["quantity", "cost"], "invoice": ["date", "due date"]}) - :rtype: ``OCEL`` + Converts an event log to an object-centric event log (OCEL) with one or more object types. + + :param log: The log object to convert. + :param activity_column: The name of the column representing activities. + :param timestamp_column: The name of the column representing timestamps. + :param object_types: A collection of column names to consider as object types. If None, defaults are used. + :param obj_separator: The separator used between different objects in the same column. Defaults to " AND ". + :param additional_event_attributes: Additional attribute names to include as event attributes in the OCEL. + :param additional_object_attributes: Additional attributes per object type to include as object attributes in the OCEL. Should be a dictionary mapping object types to lists of attribute names. + :return: An ``OCEL`` object. .. 
code-block:: python3 import pm4py - ocel = pm4py.convert_log_to_ocel(log, activity_column='concept:name', timestamp_column='time:timestamp', - object_types=['case:concept:name']) + ocel = pm4py.convert_log_to_ocel( + log, + activity_column='concept:name', + timestamp_column='time:timestamp', + object_types=['case:concept:name'] + ) """ __event_log_deprecation_warning(log) @@ -309,16 +339,27 @@ def convert_log_to_ocel(log: Union[EventLog, EventStream, pd.DataFrame], activit object_types = list(set(x for x in log.columns if x == "case:concept:name" or x.startswith("ocel:type"))) from pm4py.objects.ocel.util import log_ocel - return log_ocel.log_to_ocel_multiple_obj_types(log, activity_column, timestamp_column, object_types, obj_separator, additional_event_attributes=additional_event_attributes, additional_object_attributes=additional_object_attributes) + return log_ocel.log_to_ocel_multiple_obj_types( + log, + activity_column, + timestamp_column, + object_types, + obj_separator, + additional_event_attributes=additional_event_attributes, + additional_object_attributes=additional_object_attributes + ) def convert_ocel_to_networkx(ocel: OCEL, variant: str = "ocel_to_nx") -> nx.DiGraph: """ Converts an OCEL to a NetworkX DiGraph object. - :param ocel: object-centric event log - :param variant: variant of the conversion to use: "ocel_to_nx" -> graph containing event and object IDS and two type of relations (REL=related objects, DF=directly-follows); "ocel_features_to_nx" -> graph containing different types of interconnection at the object level - :rtype: ``nx.DiGraph`` + :param ocel: The object-centric event log to convert. + :param variant: The variant of the conversion to use. + Options: + - "ocel_to_nx": Graph containing event and object IDs and two types of relations (REL=related objects, DF=directly-follows). + - "ocel_features_to_nx": Graph containing different types of interconnections at the object level. + :return: A ``nx.DiGraph`` object representing the OCEL. .. code-block:: python3 import pm4py @@ -332,53 +373,79 @@ def convert_ocel_to_networkx(ocel: OCEL, variant: str = "ocel_to_nx") -> nx.DiGr variant1 = converter.Variants.OCEL_TO_NX elif variant == "ocel_features_to_nx": variant1 = converter.Variants.OCEL_FEATURES_TO_NX + else: + raise ValueError(f"Unsupported variant '{variant}'. Supported variants are 'ocel_to_nx' and 'ocel_features_to_nx'.") return converter.apply(ocel, variant=variant1) -def convert_log_to_networkx(log: Union[EventLog, EventStream, pd.DataFrame], include_df: bool = True, case_id_key: str = "concept:name", other_case_attributes_as_nodes: Optional[Collection[str]] = None, event_attributes_as_nodes: Optional[Collection[str]] = None) -> nx.DiGraph: +def convert_log_to_networkx( + log: Union[EventLog, EventStream, pd.DataFrame], + include_df: bool = True, + case_id_key: str = "concept:name", + other_case_attributes_as_nodes: Optional[Collection[str]] = None, + event_attributes_as_nodes: Optional[Collection[str]] = None +) -> nx.DiGraph: """ - Converts an event log object to a NetworkX DiGraph object. - The nodes of the graph are the events, the cases (and possibly the attributes of the log). 
- The edges are: - - Connecting each event to the corresponding case (BELONGS_TO type) - - Connecting every event to the directly-following one (DF type, if enabled) - - Connecting every case/event to the given attribute values (ATTRIBUTE_EDGE type) - - :param log: log object (EventLog, EventStream, Pandas dataframe) - :param include_df: include the directly-follows graph relation in the graph (bool) - :param case_id_attribute: specify which attribute at the case level should be considered the case ID (str) - :param other_case_attributes_as_nodes: specify which attributes at the case level should be inserted in the graph as nodes (other than the caseID) (list, default empty) - :param event_attributes_as_nodes: specify which attributes at the event level should be inserted in the graph as nodes (list, default empty) - :rtype: ``nx.DiGraph`` + Converts an event log to a NetworkX DiGraph object. + + The nodes of the graph include events, cases, and optionally log attributes. + The edges represent: + - BELONGS_TO: Connecting each event to its corresponding case. + - DF: Connecting events that directly follow each other (if enabled). + - ATTRIBUTE_EDGE: Connecting cases/events to their attribute values. + + :param log: The log object to convert (``EventLog``, ``EventStream``, or Pandas DataFrame). + :param include_df: Whether to include the directly-follows relation in the graph. Defaults to True. + :param case_id_key: The attribute to be used as the case identifier. Defaults to "concept:name". + :param other_case_attributes_as_nodes: Attributes at the case level to include as nodes, excluding the case ID. + :param event_attributes_as_nodes: Attributes at the event level to include as nodes. + :return: A ``nx.DiGraph`` object representing the event log. .. code-block:: python3 import pm4py - nx_digraph = pm4py.convert_log_to_networkx(log, other_case_attributes_as_nodes=['responsible', 'department'], event_attributes_as_nodes=['concept:name', 'org:resource']) + nx_digraph = pm4py.convert_log_to_networkx( + log, + other_case_attributes_as_nodes=['responsible', 'department'], + event_attributes_as_nodes=['concept:name', 'org:resource'] + ) """ from pm4py.objects.conversion.log import converter - return converter.apply(log, variant=converter.Variants.TO_NX, parameters={"include_df": include_df, "case_id_attribute": case_id_key, "other_case_attributes_as_nodes": other_case_attributes_as_nodes, "event_attributes_as_nodes": event_attributes_as_nodes}) + return converter.apply( + log, + variant=converter.Variants.TO_NX, + parameters={ + "include_df": include_df, + "case_id_attribute": case_id_key, + "other_case_attributes_as_nodes": other_case_attributes_as_nodes, + "event_attributes_as_nodes": event_attributes_as_nodes + } + ) + + +def convert_log_to_time_intervals( + log: Union[EventLog, pd.DataFrame], + filter_activity_couple: Optional[Tuple[str, str]] = None, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + start_timestamp_key: str = "time:timestamp" +) -> List[List[Any]]: + """ + Extracts a list of time intervals from an event log. + Each interval contains two temporally consecutive events within the same case and measures the time between them + (complete timestamp of the first event against the start timestamp of the second event). 
-def convert_log_to_time_intervals(log: Union[EventLog, pd.DataFrame], filter_activity_couple: Optional[Tuple[str, str]] = None, - activity_key: str = "concept:name", - timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", - start_timestamp_key: str = "time:timestamp" - ) -> List[List[Any]]: - """ - Gets a list of intervals from an event log. - Each interval contains two temporally consecutive events and measures the time between the two events - (complete timestamp of the first against start timestamp of the second). - - :param log: log object - :param filter_activity_couple: (optional) filters the intervals to only consider a given couple of activities of the log - :param activity_key: the attribute to be used as activity - :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: the attribute to be used as case identifier - :param start_timestamp_key: the attribute to be used as start timestamp - :rtype: ``List[List[Any]]`` + :param log: The log object to convert. + :param filter_activity_couple: Optional tuple to filter intervals by a specific pair of activities. + :param activity_key: The attribute to be used as the activity identifier. Defaults to "concept:name". + :param timestamp_key: The attribute to be used as the timestamp. Defaults to "time:timestamp". + :param case_id_key: The attribute to be used as the case identifier. Defaults to "case:concept:name". + :param start_timestamp_key: The attribute to be used as the start timestamp in the interval. Defaults to "time:timestamp". + :return: A list of intervals, where each interval is a list containing relevant information about the time gap. .. code-block:: python3 @@ -387,12 +454,20 @@ def convert_log_to_time_intervals(log: Union[EventLog, pd.DataFrame], filter_act log = pm4py.read_xes('tests/input_data/receipt.xes') time_intervals = pm4py.convert_log_to_time_intervals(log) print(len(time_intervals)) - time_intervals = pm4py.convert_log_to_time_intervals(log, ('Confirmation of receipt', 'T02 Check confirmation of receipt')) + time_intervals = pm4py.convert_log_to_time_intervals( + log, + filter_activity_couple=('Confirmation of receipt', 'T02 Check confirmation of receipt') + ) print(len(time_intervals)) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) properties["filter_activity_couple"] = filter_activity_couple properties[constants.PARAMETER_CONSTANT_START_TIMESTAMP_KEY] = start_timestamp_key @@ -403,38 +478,68 @@ def convert_log_to_time_intervals(log: Union[EventLog, pd.DataFrame], filter_act def convert_petri_net_to_networkx(net: PetriNet, im: Marking, fm: Marking) -> nx.DiGraph: """ Converts a Petri net to a NetworkX DiGraph. - Each place and transition is corresponding to a node in the graph. - :param net: Petri net - :param im: initial marking - :param fm: final marking - :rtype: ``nx.DiGraph`` + Each place and transition in the Petri net is represented as a node in the graph. + + :param net: The Petri net to convert. + :param im: The initial marking of the Petri net. + :param fm: The final marking of the Petri net. + :return: A ``nx.DiGraph`` object representing the Petri net. .. 
code-block:: python3 import pm4py net, im, fm = pm4py.read_pnml('tests/input_data/running-example.pnml') - nx_digraph = pm4py.convert_petri_to_networkx(net, im, fm) + nx_digraph = pm4py.convert_petri_net_to_networkx(net, im, fm) """ G = nx_utils.DiGraph() for place in net.places: - G.add_node(place.name, attr={"name": place.name, "is_in_im": place in im, "is_in_fm": place in fm, "type": "place"}) + G.add_node( + place.name, + attr={ + "name": place.name, + "is_in_im": place in im, + "is_in_fm": place in fm, + "type": "place" + } + ) for trans in net.transitions: - G.add_node(trans.name, attr={"name": trans.name, "label": trans.label, "type": "transition"}) + G.add_node( + trans.name, + attr={ + "name": trans.name, + "label": trans.label, + "type": "transition" + } + ) for arc in net.arcs: - G.add_edge(arc.source.name, arc.target.name, attr={"weight": arc.weight, "properties": arc.properties}) + G.add_edge( + arc.source.name, + arc.target.name, + attr={ + "weight": arc.weight, + "properties": arc.properties + } + ) return G -def convert_petri_net_type(net: PetriNet, im: Marking, fm: Marking, type: str = "classic") -> Tuple[PetriNet, Marking, Marking]: +def convert_petri_net_type( + net: PetriNet, + im: Marking, + fm: Marking, + type: str = "classic" +) -> Tuple[PetriNet, Marking, Marking]: """ - Changes the Petri net (internal) type + Changes the internal type of a Petri net. + + Supports conversion to different Petri net types such as classic, reset, inhibitor, and reset_inhibitor nets. - :param net: petri net - :param im: initial marking - :param fm: final marking - :param type: internal type (classic, reset, inhibitor, reset_inhibitor) - :rtype: ``Tuple[PetriNet, Marking, Marking]`` + :param net: The Petri net to convert. + :param im: The initial marking of the Petri net. + :param fm: The final marking of the Petri net. + :param type: The target Petri net type. Options are "classic", "reset", "inhibitor", "reset_inhibitor". Defaults to "classic". + :return: A tuple of the converted (``PetriNet``, ``Marking``, ``Marking``). .. code-block:: python3 import pm4py @@ -458,6 +563,9 @@ def convert_petri_net_type(net: PetriNet, im: Marking, fm: Marking, type: str = elif type == "reset_inhibitor": from pm4py.objects.petri_net.obj import ResetInhibitorNet new_net = ResetInhibitorNet(net.name) + else: + raise ValueError(f"Unsupported Petri net type '{type}'. Supported types are 'classic', 'reset', 'inhibitor', 'reset_inhibitor'.") + for place in net.places: new_net.places.add(place) in_arcs = set(place.in_arcs) @@ -476,5 +584,11 @@ def convert_petri_net_type(net: PetriNet, im: Marking, fm: Marking, type: str = trans.out_arcs.remove(arc) for arc in net.arcs: arc_type = arc.properties["arctype"] if "arctype" in arc.properties else None - new_arc = petri_utils.add_arc_from_to(arc.source, arc.target, new_net, weight=arc.weight, type=arc_type) + new_arc = petri_utils.add_arc_from_to( + arc.source, + arc.target, + new_net, + weight=arc.weight, + type=arc_type + ) return new_net, im, fm diff --git a/pm4py/discovery.py b/pm4py/discovery.py index 1515164ed..b7bb67bad 100644 --- a/pm4py/discovery.py +++ b/pm4py/discovery.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.discovery`` module contains the process discovery algorithms implemented in ``pm4py`` +The ``pm4py.discovery`` module contains the process discovery algorithms implemented in ``pm4py``. 
""" from typing import Tuple, Union, List, Dict, Any, Optional, Set @@ -50,20 +50,28 @@ def discover_dfg(log: Union[EventLog, pd.DataFrame], activity_key: str = "concep """ Discovers a Directly-Follows Graph (DFG) from a log. - This method returns a dictionary with the couples of directly-following activities (in the log) - as keys and the frequency of relation as value. + This method returns a tuple containing: + - A dictionary with pairs of directly-following activities as keys and the frequency of the relationship as values. + - A dictionary of start activities with their respective frequencies. + - A dictionary of end activities with their respective frequencies. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A tuple of three dictionaries: (dfg, start_activities, end_activities). :rtype: ``Tuple[dict, dict, dict]`` .. code-block:: python3 import pm4py - dfg, start_activities, end_activities = pm4py.discover_dfg(dataframe, case_id_key='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp') + dfg, start_activities, end_activities = pm4py.discover_dfg( + dataframe, + case_id_key='case:concept:name', + activity_key='concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -102,28 +110,33 @@ def discover_directly_follows_graph(log: Union[EventLog, pd.DataFrame], activity def discover_dfg_typed(log: pd.DataFrame, case_id_key: str = "case:concept:name", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp") -> DFG: """ - Discovers a Directly-Follows Graph (DFG) from a log. + Discovers a typed Directly-Follows Graph (DFG) from a log. - This method returns a typed DFG object, i.e., as specified in ``pm4py.objects.dfg.obj.py`` (``DirectlyFollowsGraph`` Class) - The DFG object describes a graph, start activities and end activities. - The graph is a collection of triples of the form (a,b,f) representing an arc a->b with frequency f. - The start activities are a collection of tuples of the form (a,f) representing that activity a starts f cases. - The end activities are a collection of tuples of the form (a,f) representing that ativity a ends f cases. + This method returns a typed DFG object, as specified in ``pm4py.objects.dfg.obj.py`` (``DirectlyFollowsGraph`` Class). + The DFG object includes the graph, start activities, and end activities. + - The graph is a collection of triples of the form (a, b, f) representing an arc a->b with frequency f. + - The start activities are a collection of tuples of the form (a, f) representing that activity a starts f cases. + - The end activities are a collection of tuples of the form (a, f) representing that activity a ends f cases. - This method replaces ``pm4py.discover_dfg`` and ``pm4py.discover_directly_follows_graph``. In a future release, these functions will adopt the same behavior as this function. + This method replaces ``pm4py.discover_dfg`` and ``pm4py.discover_directly_follows_graph``. 
In future releases, these functions will adopt the same behavior as this function. :param log: ``pandas.DataFrame`` - :param case_id_key: attribute to be used as case identifier - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :return: A typed DFG object containing the graph, start activities, and end activities. :rtype: ``DFG`` .. code-block:: python3 import pm4py - dfg = pm4py.discover_dfg_typed(log, case_id_key='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp') + dfg = pm4py.discover_dfg_typed( + log, + case_id_key='case:concept:name', + activity_key='concept:name', + timestamp_key='time:timestamp' + ) """ from pm4py.algo.discovery.dfg.variants import clean parameters = get_properties( @@ -131,36 +144,51 @@ def discover_dfg_typed(log: pd.DataFrame, case_id_key: str = "case:concept:name" if importlib.util.find_spec("polars"): import polars as pl - if type(log) is pl.DataFrame: + if isinstance(log, pl.DataFrame): from pm4py.algo.discovery.dfg.variants import clean_polars return clean_polars.apply(log, parameters) if pandas_utils.check_is_pandas_dataframe(log): return clean.apply(log, parameters) else: - raise TypeError('pm4py.discover_dfg_typed is only defined for dataFrames') - + raise TypeError('pm4py.discover_dfg_typed is only defined for DataFrames') + def discover_performance_dfg(log: Union[EventLog, pd.DataFrame], business_hours: bool = False, business_hour_slots=constants.DEFAULT_BUSINESS_HOUR_SLOTS, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Tuple[dict, dict, dict]: """ - Discovers a performance directly-follows graph from an event log. - - This method returns a dictionary with the couples of directly-following activities (in the log) - as keys and the performance of relation as value. - - :param log: event log / Pandas dataframe - :param business_hours: enables/disables the computation based on the business hours (default: False) - :param business_hour_slots: work schedule of the company, provided as a list of tuples where each tuple represents one time slot of business hours. One slot i.e. one tuple consists of one start and one end time given in seconds since week start, e.g. [(7 * 60 * 60, 17 * 60 * 60), ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60),] meaning that business hours are Mondays 07:00 - 17:00 and Tuesdays 07:00 - 12:00 and 13:00 - 17:00 - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + Discovers a Performance Directly-Follows Graph from an event log. + + This method returns a tuple containing: + - A dictionary with pairs of directly-following activities as keys and the performance metrics of the relationship as values. + - A dictionary of start activities with their respective frequencies. + - A dictionary of end activities with their respective frequencies. + + :param log: Event log or Pandas DataFrame. 
+ :param business_hours: Enables or disables computation based on business hours (default: False). + :param business_hour_slots: Work schedule of the company, provided as a list of tuples where each tuple represents one time slot of business hours. Each slot consists of a start and end time given in seconds since the week start. Example: + ```python + [ + (7 * 60 * 60, 17 * 60 * 60), # Monday 07:00 - 17:00 + ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), # Tuesday 07:00 - 12:00 + ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60) # Tuesday 13:00 - 17:00 + ] + ``` + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A tuple of three dictionaries: (performance_dfg, start_activities, end_activities). :rtype: ``Tuple[dict, dict, dict]`` .. code-block:: python3 import pm4py - performance_dfg, start_activities, end_activities = pm4py.discover_performance_dfg(dataframe, case_id_key='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp') + performance_dfg, start_activities, end_activities = pm4py.discover_performance_dfg( + dataframe, + case_id_key='case:concept:name', + activity_key='concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -200,17 +228,23 @@ def discover_petri_net_alpha(log: Union[EventLog, pd.DataFrame], activity_key: s """ Discovers a Petri net using the Alpha Miner. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A tuple containing the Petri net, initial marking, and final marking. :rtype: ``Tuple[PetriNet, Marking, Marking]`` .. code-block:: python3 import pm4py - net, im, fm = pm4py.discover_petri_net_alpha(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_alpha( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -226,18 +260,24 @@ def discover_petri_net_ilp(log: Union[EventLog, pd.DataFrame], alpha: float = 1. """ Discovers a Petri net using the ILP Miner. - :param log: event log / Pandas dataframe - :param alpha: noise threshold for the sequence encoding graph (1.0=no filtering, 0.0=greatest filtering) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param alpha: Noise threshold for the sequence encoding graph (1.0=no filtering, 0.0=maximum filtering) (default: 1.0). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). 
+ :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A tuple containing the Petri net, initial marking, and final marking. :rtype: ``Tuple[PetriNet, Marking, Marking]`` .. code-block:: python3 import pm4py - net, im, fm = pm4py.discover_petri_net_ilp(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_ilp( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -252,22 +292,31 @@ def discover_petri_net_ilp(log: Union[EventLog, pd.DataFrame], alpha: float = 1. return ilp_miner.apply(log, variant=ilp_miner.Variants.CLASSIC, parameters=parameters) -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="this method will be removed in a future release.") +@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="This method will be removed in a future release.") def discover_petri_net_alpha_plus(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Tuple[PetriNet, Marking, Marking]: """ - Discovers a Petri net using the Alpha+ algorithm + Discovers a Petri net using the Alpha+ algorithm. + + .. deprecated:: 2.3.0 + This method will be removed in version 3.0.0. Use other discovery methods instead. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A tuple containing the Petri net, initial marking, and final marking. :rtype: ``Tuple[PetriNet, Marking, Marking]`` .. 
code-block:: python3 import pm4py - net, im, fm = pm4py.discover_petri_net_alpha_plus(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_alpha_plus( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -279,29 +328,33 @@ def discover_petri_net_alpha_plus(log: Union[EventLog, pd.DataFrame], activity_k return alpha_miner.apply(log, variant=alpha_miner.Variants.ALPHA_VERSION_PLUS, parameters=get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)) -def discover_petri_net_inductive(log: Union[EventLog, pd.DataFrame, DFG], multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, noise_threshold: float = 0.0, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", disable_fallthroughs: bool = False) -> Tuple[ - PetriNet, Marking, Marking]: +def discover_petri_net_inductive(log: Union[EventLog, pd.DataFrame, DFG], multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, noise_threshold: float = 0.0, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", disable_fallthroughs: bool = False) -> Tuple[PetriNet, Marking, Marking]: """ - Discovers a Petri net using the inductive miner algorithm. - - The basic idea of Inductive Miner is about detecting a 'cut' in the log (e.g. sequential cut, parallel cut, concurrent cut and loop cut) and then recur on sublogs, which were found applying the cut, until a base case is found. The Directly-Follows variant avoids the recursion on the sublogs but uses the Directly Follows graph. + Discovers a Petri net using the Inductive Miner algorithm. - Inductive miner models usually make extensive use of hidden transitions, especially for skipping/looping on a portion on the model. Furthermore, each visible transition has a unique label (there are no transitions in the model that share the same label). + The Inductive Miner detects a 'cut' in the log (e.g., sequential, parallel, concurrent, loop) and recursively applies the algorithm to sublogs until a base case is found. + Inductive miner models typically use hidden transitions for skipping or looping portions of the model, and each visible transition has a unique label. - :param log: event log / Pandas dataframe / typed DFG - :param noise_threshold: noise threshold (default: 0.0) - :param multi_processing: boolean that enables/disables multiprocessing in inductive miner - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param disable_fallthroughs: disable the Inductive Miner fall-throughs + :param log: Event log, Pandas DataFrame, or typed DFG. + :param multi_processing: Enables or disables multiprocessing in the Inductive Miner (default: constants.ENABLE_MULTIPROCESSING_DEFAULT). + :param noise_threshold: Noise threshold (default: 0.0). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :param disable_fallthroughs: Disables the Inductive Miner fall-throughs (default: False). 
+ :return: A tuple containing the Petri net, initial marking, and final marking. :rtype: ``Tuple[PetriNet, Marking, Marking]`` .. code-block:: python3 import pm4py - net, im, fm = pm4py.discover_petri_net_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -319,24 +372,31 @@ def discover_petri_net_heuristics(log: Union[EventLog, pd.DataFrame], dependency and_threshold: float = 0.65, loop_two_threshold: float = 0.5, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Tuple[PetriNet, Marking, Marking]: """ - Discover a Petri net using the Heuristics Miner + Discovers a Petri net using the Heuristics Miner. - Heuristics Miner is an algorithm that acts on the Directly-Follows Graph, providing way to handle with noise and to find common constructs (dependency between two activities, AND). The output of the Heuristics Miner is an Heuristics Net, so an object that contains the activities and the relationships between them. The Heuristics Net can be then converted into a Petri net. The paper can be visited by clicking on the upcoming link: this link). + Heuristics Miner operates on the Directly-Follows Graph, handling noise and identifying common constructs such as dependencies between activities and parallelism. + The output is a Heuristics Net, which can then be converted into a Petri net. - :param log: event log / Pandas dataframe - :param dependency_threshold: dependency threshold (default: 0.5) - :param and_threshold: AND threshold (default: 0.65) - :param loop_two_threshold: loop two threshold (default: 0.5) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param dependency_threshold: Dependency threshold (default: 0.5). + :param and_threshold: AND threshold for parallelism (default: 0.65). + :param loop_two_threshold: Loop two threshold (default: 0.5). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A tuple containing the Petri net, initial marking, and final marking. :rtype: ``Tuple[PetriNet, Marking, Marking]`` .. 
code-block:: python3 import pm4py - net, im, fm = pm4py.discover_petri_net_heuristics(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + net, im, fm = pm4py.discover_petri_net_heuristics( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -358,26 +418,31 @@ def discover_petri_net_heuristics(log: Union[EventLog, pd.DataFrame], dependency def discover_process_tree_inductive(log: Union[EventLog, pd.DataFrame, DFG], noise_threshold: float = 0.0, multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", disable_fallthroughs: bool = False) -> ProcessTree: """ - Discovers a process tree using the inductive miner algorithm + Discovers a Process Tree using the Inductive Miner algorithm. - The basic idea of Inductive Miner is about detecting a 'cut' in the log (e.g. sequential cut, parallel cut, concurrent cut and loop cut) and then recur on sublogs, which were found applying the cut, until a base case is found. The Directly-Follows variant avoids the recursion on the sublogs but uses the Directly Follows graph. + The Inductive Miner detects a 'cut' in the log (e.g., sequential, parallel, concurrent, loop) and recursively applies the algorithm to sublogs until a base case is found. + Inductive miner models typically use hidden transitions for skipping or looping portions of the model, and each visible transition has a unique label. - Inductive miner models usually make extensive use of hidden transitions, especially for skipping/looping on a portion on the model. Furthermore, each visible transition has a unique label (there are no transitions in the model that share the same label). - - :param log: event log / Pandas dataframe / typed DFG - :param noise_threshold: noise threshold (default: 0.0) - :param activity_key: attribute to be used for the activity - :param multi_processing: boolean that enables/disables multiprocessing in inductive miner - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param disable_fallthroughs: disable the Inductive Miner fall-throughs + :param log: Event log, Pandas DataFrame, or typed DFG. + :param noise_threshold: Noise threshold (default: 0.0). + :param multi_processing: Enables or disables multiprocessing in the Inductive Miner (default: constants.ENABLE_MULTIPROCESSING_DEFAULT). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :param disable_fallthroughs: Disables the Inductive Miner fall-throughs (default: False). + :return: A ProcessTree object. :rtype: ``ProcessTree`` .. 
code-block:: python3 import pm4py - process_tree = pm4py.discover_process_tree_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + process_tree = pm4py.discover_process_tree_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -404,27 +469,34 @@ def discover_heuristics_net(log: Union[EventLog, pd.DataFrame], dependency_thres and_threshold: float = 0.65, loop_two_threshold: float = 0.5, min_act_count: int = 1, min_dfg_occurrences: int = 1, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", decoration: str = "frequency") -> HeuristicsNet: """ - Discovers an heuristics net - - Heuristics Miner is an algorithm that acts on the Directly-Follows Graph, providing way to handle with noise and to find common constructs (dependency between two activities, AND). The output of the Heuristics Miner is an Heuristics Net, so an object that contains the activities and the relationships between them. The Heuristics Net can be then converted into a Petri net. The paper can be visited by clicking on the upcoming link: this link). - - :param log: event log / Pandas dataframe - :param dependency_threshold: dependency threshold (default: 0.5) - :param and_threshold: AND threshold (default: 0.65) - :param loop_two_threshold: loop two threshold (default: 0.5) - :param min_act_count: minimum number of occurrences per activity in order to be included in the discovery - :param min_dfg_occurrences: minimum number of occurrences per arc in the DFG in order to be included in the discovery - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param decoration: the decoration that should be used (frequency, performance) + Discovers a Heuristics Net. + + Heuristics Miner operates on the Directly-Follows Graph, handling noise and identifying common constructs such as dependencies between activities and parallelism. + The output is a Heuristics Net, which can then be converted into a Petri net. + + :param log: Event log or Pandas DataFrame. + :param dependency_threshold: Dependency threshold (default: 0.5). + :param and_threshold: AND threshold for parallelism (default: 0.65). + :param loop_two_threshold: Loop two threshold (default: 0.5). + :param min_act_count: Minimum number of occurrences per activity to be included in the discovery (default: 1). + :param min_dfg_occurrences: Minimum number of occurrences per arc in the DFG to be included in the discovery (default: 1). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :param decoration: The decoration to be used ("frequency" or "performance") (default: "frequency"). + :return: A HeuristicsNet object. :rtype: ``HeuristicsNet`` .. 
code-block:: python3 import pm4py - heu_net = pm4py.discover_heuristics_net(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + heu_net = pm4py.discover_heuristics_net( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -449,21 +521,30 @@ def discover_heuristics_net(log: Union[EventLog, pd.DataFrame], dependency_thres def derive_minimum_self_distance(log: Union[DataFrame, EventLog, EventStream], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, int]: """ - This algorithm computes the minimum self-distance for each activity observed in an event log. - The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc. - The activity key 'concept:name' is used. + Computes the minimum self-distance for each activity observed in an event log. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + The self-distance of activity `a` in `<a>` is infinity, + in `<a, a>` is 0, + in `<a, b, a>` is 1, + etc. The activity key 'concept:name' is used. + + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A dictionary mapping each activity to its minimum self-distance. :rtype: ``Dict[str, int]`` .. code-block:: python3 import pm4py - msd = pm4py.derive_minimum_self_distance(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + msd = pm4py.derive_minimum_self_distance( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -478,16 +559,24 @@ def derive_minimum_self_distance(log: Union[DataFrame, EventLog, EventStream], a def discover_footprints(*args: Union[EventLog, Tuple[PetriNet, Marking, Marking], ProcessTree]) -> Union[ List[Dict[str, Any]], Dict[str, Any]]: """ - Discovers the footprints out of the provided event log / process model + Discovers the footprints from the provided event log or process model. + + Footprints are a high-level representation of the behavior captured in the event log or process model. - :param args: event log / process model + :param args: Event log, process model (Petri net and markings), or ProcessTree. + :return: A list of footprint dictionaries or a single footprint dictionary. :rtype: ``Union[List[Dict[str, Any]], Dict[str, Any]]`` ..
code-block:: python3 import pm4py - footprints = pm4py.discover_footprints(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + footprints = pm4py.discover_footprints( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ from pm4py.algo.discovery.footprints import algorithm as fp_discovery return fp_discovery.apply(*args) @@ -495,23 +584,27 @@ def discover_footprints(*args: Union[EventLog, Tuple[PetriNet, Marking, Marking] def discover_eventually_follows_graph(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[Tuple[str, str], int]: """ - Gets the eventually follows graph from a log object. + Generates the Eventually-Follows Graph from a log. - The eventually follows graph is a dictionary associating to every - couple of activities which are eventually following each other the - number of occurrences of this relation. + The Eventually-Follows Graph is a dictionary that maps each pair of activities to the number of times one activity eventually follows the other in the log. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A dictionary mapping each pair of activities to the count of their eventually-follows relationship. :rtype: ``Dict[Tuple[str, str], int]`` .. code-block:: python3 import pm4py - efg = pm4py.discover_eventually_follows_graph(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + efg = pm4py.discover_eventually_follows_graph( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -530,26 +623,31 @@ def discover_eventually_follows_graph(log: Union[EventLog, pd.DataFrame], activi def discover_bpmn_inductive(log: Union[EventLog, pd.DataFrame, DFG], noise_threshold: float = 0.0, multi_processing: bool = constants.ENABLE_MULTIPROCESSING_DEFAULT, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", disable_fallthroughs: bool = False) -> BPMN: """ - Discovers a BPMN using the Inductive Miner algorithm - - The basic idea of Inductive Miner is about detecting a 'cut' in the log (e.g. sequential cut, parallel cut, concurrent cut and loop cut) and then recur on sublogs, which were found applying the cut, until a base case is found. The Directly-Follows variant avoids the recursion on the sublogs but uses the Directly Follows graph. + Discovers a BPMN model using the Inductive Miner algorithm. - Inductive miner models usually make extensive use of hidden transitions, especially for skipping/looping on a portion on the model. Furthermore, each visible transition has a unique label (there are no transitions in the model that share the same label). 
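To make the eventually-follows counts documented just above concrete, a toy sketch on a hypothetical two-case DataFrame (column names follow the pm4py defaults):

```python
import pandas as pd
import pm4py

# Two cases: <A, B, C> and <A, C>.
df = pd.DataFrame({
    "case:concept:name": ["1", "1", "1", "2", "2"],
    "concept:name": ["A", "B", "C", "A", "C"],
    "time:timestamp": pd.to_datetime([
        "2024-01-01 08:00", "2024-01-01 09:00", "2024-01-01 10:00",
        "2024-01-02 08:00", "2024-01-02 09:00",
    ]),
})

efg = pm4py.discover_eventually_follows_graph(df)
# Expected counts: ('A', 'B') -> 1, ('A', 'C') -> 2, ('B', 'C') -> 1
print(efg)
```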
+ The Inductive Miner detects a 'cut' in the log (e.g., sequential, parallel, concurrent, loop) and recursively applies the algorithm to sublogs until a base case is found. + Inductive miner models typically use hidden transitions for skipping or looping portions of the model, and each visible transition has a unique label. - :param log: event log / Pandas dataframe / typed DFG - :param noise_threshold: noise threshold (default: 0.0) - :param multi_processing: boolean that enables/disables multiprocessing in inductive miner - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param disable_fallthroughs: disable the Inductive Miner fall-throughs + :param log: Event log, Pandas DataFrame, or typed DFG. + :param noise_threshold: Noise threshold (default: 0.0). + :param multi_processing: Enables or disables multiprocessing in the Inductive Miner (default: constants.ENABLE_MULTIPROCESSING_DEFAULT). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :param disable_fallthroughs: Disables the Inductive Miner fall-throughs (default: False). + :return: A BPMN object representing the discovered BPMN model. :rtype: ``BPMN`` .. code-block:: python3 import pm4py - bpmn_graph = pm4py.discover_bpmn_inductive(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + bpmn_graph = pm4py.discover_bpmn_inductive( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -565,23 +663,30 @@ def discover_bpmn_inductive(log: Union[EventLog, pd.DataFrame, DFG], noise_thres def discover_transition_system(log: Union[EventLog, pd.DataFrame], direction: str = "forward", window: int = 2, view: str = "sequence", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> TransitionSystem: """ - Discovers a transition system as described in the process mining book - "Process Mining: Data Science in Action" + Discovers a Transition System from a log. - :param log: event log / Pandas dataframe - :param direction: direction in which the transition system is built (forward, backward) - :param window: window (2, 3, ...) - :param view: view to use in the construction of the states (sequence, set, multiset) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + The Transition System is built based on the specified direction, window size, and view. It captures the transitions between states of activity sequences. + + :param log: Event log or Pandas DataFrame. + :param direction: Direction in which the transition system is built ("forward" or "backward") (default: "forward"). + :param window: Window size for state construction (e.g., 2, 3) (default: 2). + :param view: View to use in the construction of the states ("sequence", "set", "multiset") (default: "sequence"). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). 
+ :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A TransitionSystem object representing the discovered transition system. :rtype: ``TransitionSystem`` .. code-block:: python3 import pm4py - transition_system = pm4py.discover_transition_system(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + transition_system = pm4py.discover_transition_system( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -601,19 +706,27 @@ def discover_transition_system(log: Union[EventLog, pd.DataFrame], direction: st def discover_prefix_tree(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Trie: """ - Discovers a prefix tree from the provided log object. + Discovers a Prefix Tree from the provided log. + + A Prefix Tree represents all the unique prefixes of activity sequences in the log. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A Trie object representing the discovered prefix tree. :rtype: ``Trie`` .. code-block:: python3 import pm4py - prefix_tree = pm4py.discover_prefix_tree(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + prefix_tree = pm4py.discover_prefix_tree( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -630,33 +743,45 @@ def discover_prefix_tree(log: Union[EventLog, pd.DataFrame], activity_key: str = def discover_temporal_profile(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[Tuple[str, str], Tuple[float, float]]: """ - Discovers a temporal profile from a log object. + Discovers a Temporal Profile from a log. Implements the approach described in: Stertz, Florian, Jürgen Mangler, and Stefanie Rinderle-Ma. "Temporal Conformance Checking at Runtime based on Time-infused Process Models." arXiv preprint arXiv:2008.07262 (2020). - The output is a dictionary containing, for every couple of activities eventually following in at least a case of the log, - the average and the standard deviation of the difference of the timestamps. - - E.g. if the log has two cases: - - A (timestamp: 1980-01) B (timestamp: 1980-03) C (timestamp: 1980-06) - A (timestamp: 1990-01) B (timestamp: 1990-02) D (timestamp: 1990-03) + The output is a dictionary containing, for every pair of activities that eventually follow each other in at least one case of the log, + the average and the standard deviation of the time difference between their timestamps. 
+ Example: + If the log has two cases: + - Case 1: A (timestamp: 1980-01) → B (timestamp: 1980-03) → C (timestamp: 1980-06) + - Case 2: A (timestamp: 1990-01) → B (timestamp: 1990-02) → D (timestamp: 1990-03) + The returned dictionary will contain: - {('A', 'B'): (1.5 months, 0.5 months), ('A', 'C'): (5 months, 0), ('A', 'D'): (2 months, 0)} - - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + ``` + { + ('A', 'B'): (1.5 months, 0.5 months), + ('A', 'C'): (5 months, 0), + ('A', 'D'): (2 months, 0) + } + ``` + + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A dictionary mapping each pair of activities to a tuple of (average time difference, standard deviation). :rtype: ``Dict[Tuple[str, str], Tuple[float, float]]`` .. code-block:: python3 import pm4py - temporal_profile = pm4py.discover_temporal_profile(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + temporal_profile = pm4py.discover_temporal_profile( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -673,40 +798,38 @@ def discover_temporal_profile(log: Union[EventLog, pd.DataFrame], activity_key: def discover_log_skeleton(log: Union[EventLog, pd.DataFrame], noise_threshold: float = 0.0, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, Any]: """ - Discovers a log skeleton from an event log. - - A log skeleton is a declarative model which consists of six different constraints: - - "directly_follows": specifies for some activities some strict bounds on the activities directly-following. For example, - 'A should be directly followed by B' and 'B should be directly followed by C'. - - "always_before": specifies that some activities may be executed only if some other activities are executed somewhen before - in the history of the case. - For example, 'C should always be preceded by A' - - "always_after": specifies that some activities should always trigger the execution of some other activities - in the future history of the case. - For example, 'A should always be followed by C' - - "equivalence": specifies that a given couple of activities should happen with the same number of occurrences inside - a case. - For example, 'B and C should always happen the same number of times'. - - "never_together": specifies that a given couple of activities should never happen together in the history of the case. - For example, 'there should be no case containing both C and D'. - - "activ_occurrences": specifies the allowed number of occurrences per activity: - E.g. A is allowed to be executed 1 or 2 times, B is allowed to be executed 1 or 2 or 3 or 4 times. + Discovers a Log Skeleton from an event log. + + A Log Skeleton is a declarative model consisting of six different constraints: + - **directly_follows**: Specifies strict bounds on activities that directly follow each other. 
Example: 'A should be directly followed by B' and 'B should be directly followed by C'. + - **always_before**: Specifies that some activities may only be executed if certain other activities have been executed earlier in the case. Example: 'C should always be preceded by A'. + - **always_after**: Specifies that certain activities should always trigger the execution of some other activities later in the case. Example: 'A should always be followed by C'. + - **equivalence**: Specifies that a given pair of activities should occur the same number of times within a case. Example: 'B and C should always occur the same number of times'. + - **never_together**: Specifies that a given pair of activities should never occur together in a case. Example: 'There should be no case containing both C and D'. + - **activ_occurrences**: Specifies allowed numbers of occurrences per activity. Example: 'Activity A can occur 1 or 2 times, and Activity B can occur 1 to 4 times'. Reference paper: Verbeek, H. M. W., and R. Medeiros de Carvalho. "Log skeletons: A classification approach to process discovery." arXiv preprint arXiv:1806.08247 (2018). - :param log: event log / Pandas dataframe - :param noise_threshold: noise threshold, acting as described in the paper. - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param noise_threshold: Noise threshold influencing the strictness of constraints (default: 0.0). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A dictionary representing the Log Skeleton with various constraints. :rtype: ``Dict[str, Any]`` .. code-block:: python3 import pm4py - log_skeleton = pm4py.discover_log_skeleton(dataframe, noise_threshold=0.1, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + log_skeleton = pm4py.discover_log_skeleton( + dataframe, + noise_threshold=0.1, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -729,15 +852,16 @@ def discover_declare(log: Union[EventLog, pd.DataFrame], allowed_templates: Opti Reference paper: F. M. Maggi, A. J. Mooij and W. M. P. van der Aalst, "User-guided discovery of declarative process models," 2011 IEEE Symposium on Computational Intelligence and Data Mining (CIDM), Paris, France, 2011, pp. 192-199, doi: 10.1109/CIDM.2011.5949297. 
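A brief usage sketch for the DECLARE discovery described above; the `receipt.xes` path mirrors the example logs used elsewhere in these docstrings, and the thresholds shown are illustrative:

```python
import pm4py

log = pm4py.read_xes('tests/input_data/receipt.xes')

# Both ratio arguments are optional; pm4py chooses defaults when they are omitted.
declare_model = pm4py.discover_declare(
    log,
    min_support_ratio=0.3,
    min_confidence_ratio=0.7
)

# The result maps each DECLARE template to its discovered rules.
for template, rules in declare_model.items():
    print(template, len(rules))
```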
- :param log: event log / Pandas dataframe - :param allowed_templates: (optional) collection of templates to consider for the discovery - :param considered_activities: (optional) collection of activities to consider for the discovery - :param min_support_ratio: (optional, decided automatically otherwise) minimum percentage of cases (over the entire set of cases of the log) for which the discovered rules apply - :param min_confidence_ratio: (optional, decided automatically otherwise) minimum percentage of cases (over the rule's support) for which the discovered rules are valid - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, Any]`` + :param log: Event log or Pandas DataFrame. + :param allowed_templates: (Optional) Set of DECLARE templates to consider for discovery. + :param considered_activities: (Optional) Set of activities to consider for discovery. + :param min_support_ratio: (Optional) Minimum percentage of cases for which the discovered rules apply. + :param min_confidence_ratio: (Optional) Minimum percentage of cases for which the discovered rules are valid, based on the rule's support. + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A dictionary representing the discovered DECLARE model with constraints and their parameters. + :rtype: ``Dict[str, Dict[Any, Dict[str, int]]]`` .. code-block:: python3 @@ -767,18 +891,19 @@ def discover_powl(log: Union[EventLog, pd.DataFrame], variant=None, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> POWL: """ - Discovers a POWL model from an event log. + Discovers a POWL (Partially Ordered Workflow Language) model from an event log. Reference paper: Kourani, Humam, and Sebastiaan J. van Zelst. "POWL: partially ordered workflow language." International Conference on Business Process Management. Cham: Springer Nature Switzerland, 2023. - :param log: event log / Pandas dataframe - :param variant: variant of the algorithm - :param filtering_weight_factor: accepts values 0 <= x < 1 - :param order_graph_filtering_threshold: accepts values 0.5 < x <= 1 - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param variant: Variant of the POWL discovery algorithm to use. + :param filtering_weight_factor: Factoring threshold for filtering weights, accepts values 0 <= x < 1 (default: 0.0). + :param order_graph_filtering_threshold: Filtering threshold for the order graph, valid for the DYNAMIC_CLUSTERING variant, accepts values 0.5 < x <= 1 (default: None). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :return: A POWL object representing the discovered POWL model. :rtype: ``POWL`` .. 
code-block:: python3 @@ -786,7 +911,10 @@ def discover_powl(log: Union[EventLog, pd.DataFrame], variant=None, import pm4py log = pm4py.read_xes('tests/input_data/receipt.xes') - powl_model = pm4py.discover_powl(log, activity_key='concept:name') + powl_model = pm4py.discover_powl( + log, + activity_key='concept:name' + ) print(powl_model) """ from pm4py.algo.discovery.powl.inductive.variants.dynamic_clustering_frequency.dynamic_clustering_frequency_partial_order_cut import \ @@ -810,7 +938,7 @@ def discover_powl(log: Union[EventLog, pd.DataFrame], variant=None, if variant is POWLDiscoveryVariant.DYNAMIC_CLUSTERING: properties[ORDER_FREQUENCY_RATIO] = order_graph_filtering_threshold else: - raise Exception("the order graph filtering threshold can only be used for the variant DYNAMIC_CLUSTERING") + raise Exception("The order graph filtering threshold can only be used for the DYNAMIC_CLUSTERING variant.") properties["filtering_threshold"] = filtering_weight_factor @@ -821,45 +949,45 @@ def discover_powl(log: Union[EventLog, pd.DataFrame], variant=None, def discover_batches(log: Union[EventLog, pd.DataFrame], merge_distance: int = 15 * 60, min_batch_size: int = 2, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", resource_key: str = "org:resource") -> List[ Tuple[Tuple[str, str], int, Dict[str, Any]]]: """ - Discover batches from the provided log object - - We say that an activity is executed in batches by a given resource when the resource executes several times the same activity in a short period of time. - - Identifying such activities may identify points of the process that can be automated, since the activity of the person may be repetitive. - - The following categories of batches are detected: - - Simultaneous (all the events in the batch have identical start and end timestamps) - - Batching at start (all the events in the batch have identical start timestamp) - - Batching at end (all the events in the batch have identical end timestamp) - - Sequential batching (for all the consecutive events, the end of the first is equal to the start of the second) - - Concurrent batching (for all the consecutive events that are not sequentially matched) - - The approach has been described in the following paper: - Martin, N., Swennen, M., Depaire, B., Jans, M., Caris, A., & Vanhoof, K. (2015, December). Batch Processing: - Definition and Event Log Identification. In SIMPDA (pp. 137-140). - - The output is a (sorted) list containing tuples. Each tuple contain: - - Index 0: the activity-resource for which at least one batch has been detected - - Index 1: the number of batches for the given activity-resource - - Index 2: a list containing all the batches. Each batch is described by: - # The start timestamp of the batch - # The complete timestamp of the batch - # The list of events that are executed in the batch - - :param log: event log / Pandas dataframe - :param merge_distance: the maximum time distance between non-overlapping intervals in order for them to be considered belonging to the same batch (default: 15*60 15 minutes) - :param min_batch_size: the minimum number of events for a batch to be considered (default: 2) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param resource_key: attribute to be used as resource + Discovers batches from the provided log. 
+ + An activity is executed in batches by a given resource when the resource performs the same activity multiple times in a short period. + Identifying such activities may highlight repetitive tasks that could be automated. + + The following batch categories are detected: + - **Simultaneous**: All events in the batch have identical start and end timestamps. + - **Batching at Start**: All events in the batch have identical start timestamps. + - **Batching at End**: All events in the batch have identical end timestamps. + - **Sequential Batching**: Consecutive events have the end of the first equal to the start of the second. + - **Concurrent Batching**: Consecutive events that do not match sequentially. + + Reference paper: + Martin, N., Swennen, M., Depaire, B., Jans, M., Caris, A., & Vanhoof, K. (2015, December). Batch Processing: Definition and Event Log Identification. In SIMPDA (pp. 137-140). + + :param log: Event log or Pandas DataFrame. + :param merge_distance: Maximum time distance (in seconds) between non-overlapping intervals to consider them part of the same batch (default: 900 seconds, i.e., 15 minutes). + :param min_batch_size: Minimum number of events required to form a batch (default: 2). + :param activity_key: Attribute to be used for the activity (default: "concept:name"). + :param timestamp_key: Attribute to be used for the timestamp (default: "time:timestamp"). + :param case_id_key: Attribute to be used as case identifier (default: "case:concept:name"). + :param resource_key: Attribute to be used as resource (default: "org:resource"). + :return: A sorted list of tuples, each containing: + - The (activity, resource) pair. + - The number of batches for the given activity-resource. + - A dictionary with batch details. :rtype: ``List[Tuple[Tuple[str, str], int, Dict[str, Any]]]`` .. code-block:: python3 import pm4py - batches = pm4py.discover_log_skeleton(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp', resource_key='org:resource') + batches = pm4py.discover_batches( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp', + resource_key='org:resource' + ) """ __event_log_deprecation_warning(log) diff --git a/pm4py/filtering.py b/pm4py/filtering.py index 29f5e5597..3d87c8e62 100644 --- a/pm4py/filtering.py +++ b/pm4py/filtering.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.filtering`` module contains the filtering features offered in ``pm4py`` +The ``pm4py.filtering`` module contains the filtering features offered in ``pm4py``. 
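Before moving on to the filtering module, a short sketch of how the batch-detection output described above can be consumed (assuming `dataframe` is a formatted event log with a resource column, as in the surrounding docstring examples):

```python
import pm4py

# 'dataframe' is assumed to be an event log DataFrame, as in the docstring examples above.
batches = pm4py.discover_batches(
    dataframe,
    merge_distance=15 * 60,   # group intervals at most 15 minutes apart
    min_batch_size=2,
    activity_key='concept:name',
    case_id_key='case:concept:name',
    timestamp_key='time:timestamp',
    resource_key='org:resource'
)

# Each entry pairs an (activity, resource) couple with the number of detected
# batches and a structure describing the individual batches.
for (activity, resource), num_batches, batch_details in batches:
    print(activity, resource, num_batches)
```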
""" from typing import Union, Set, List, Tuple, Collection, Any, Dict, Optional @@ -37,25 +37,39 @@ import datetime -def filter_log_relative_occurrence_event_attribute(log: Union[EventLog, pd.DataFrame], min_relative_stake: float, attribute_key : str = xes_constants.DEFAULT_NAME_KEY, level="cases", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_log_relative_occurrence_event_attribute( + log: Union[EventLog, pd.DataFrame], + min_relative_stake: float, + attribute_key: str = xes_constants.DEFAULT_NAME_KEY, + level: str = "cases", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filters the event log keeping only the events having an attribute value which occurs: - - in at least the specified (min_relative_stake) percentage of events, when level="events" - - in at least the specified (min_relative_stake) percentage of cases, when level="cases" + Filters the event log, keeping only the events that have an attribute value which occurs: + - in at least the specified (min_relative_stake) percentage of events when level="events", + - in at least the specified (min_relative_stake) percentage of cases when level="cases". - :param log: event log / Pandas dataframe - :param min_relative_stake: minimum percentage of cases (expressed as a number between 0 and 1) in which the attribute should occur. - :param attribute_key: the attribute to filter - :param level: the level of the filter (if level="events", then events / if level="cases", then cases) - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param min_relative_stake: Minimum percentage of cases (expressed as a number between 0 and 1) in which the attribute should occur. + :param attribute_key: The attribute to filter. + :param level: The level of the filter (if level="events", then events; if level="cases", then cases). + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_log_relative_occurrence_event_attribute(dataframe, 0.5, level='cases', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_log_relative_occurrence_event_attribute( + dataframe, + 0.5, + attribute_key='concept:name', + level='cases', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -73,24 +87,36 @@ def filter_log_relative_occurrence_event_attribute(log: Union[EventLog, pd.DataF return attributes_filter.filter_log_relative_occurrence_event_attribute(log, min_relative_stake, parameters=parameters) -def filter_start_activities(log: Union[EventLog, pd.DataFrame], activities: Union[Set[str], List[str]], retain: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> \ -Union[EventLog, pd.DataFrame]: +def filter_start_activities( + log: Union[EventLog, pd.DataFrame], + activities: Union[Set[str], List[str]], + retain: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filter cases having a start activity in the provided list + Filters cases that have a start activity in the provided list. - :param log: event log / Pandas dataframe - :param activities: collection of start activities - :param retain: if True, we retain the traces containing the given start activities, if false, we drop the traces - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param activities: Collection of start activities. + :param retain: If True, retains the traces containing the given start activities; if False, drops the traces. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_start_activities(dataframe, ['Act. A'], activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_start_activities( + dataframe, + ['Act. 
A'], + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -99,33 +125,43 @@ def filter_start_activities(log: Union[EventLog, pd.DataFrame], activities: Unio check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) from pm4py.algo.filtering.pandas.start_activities import start_activities_filter parameters[start_activities_filter.Parameters.POSITIVE] = retain - return start_activities_filter.apply(log, activities, - parameters=parameters) + return start_activities_filter.apply(log, activities, parameters=parameters) else: from pm4py.algo.filtering.log.start_activities import start_activities_filter parameters[start_activities_filter.Parameters.POSITIVE] = retain - return start_activities_filter.apply(log, activities, - parameters=parameters) + return start_activities_filter.apply(log, activities, parameters=parameters) -def filter_end_activities(log: Union[EventLog, pd.DataFrame], activities: Union[Set[str], List[str]], retain: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[ - EventLog, pd.DataFrame]: +def filter_end_activities( + log: Union[EventLog, pd.DataFrame], + activities: Union[Set[str], List[str]], + retain: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filter cases having an end activity in the provided list + Filters cases that have an end activity in the provided list. - :param log: event log / Pandas dataframe - :param activities: collection of end activities - :param retain: if True, we retain the traces containing the given end activities, if false, we drop the traces - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param activities: Collection of end activities. + :param retain: If True, retains the traces containing the given end activities; if False, drops the traces. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_end_activities(dataframe, ['Act. Z'], activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_end_activities( + dataframe, + ['Act. 
Z'], + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -134,33 +170,42 @@ def filter_end_activities(log: Union[EventLog, pd.DataFrame], activities: Union check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) from pm4py.algo.filtering.pandas.end_activities import end_activities_filter parameters[end_activities_filter.Parameters.POSITIVE] = retain - return end_activities_filter.apply(log, activities, - parameters=parameters) + return end_activities_filter.apply(log, activities, parameters=parameters) else: from pm4py.algo.filtering.log.end_activities import end_activities_filter parameters[end_activities_filter.Parameters.POSITIVE] = retain - return end_activities_filter.apply(log, activities, - parameters=parameters) + return end_activities_filter.apply(log, activities, parameters=parameters) -def filter_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]], - level: str = "case", retain: bool = True, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_event_attribute_values( + log: Union[EventLog, pd.DataFrame], + attribute_key: str, + values: Union[Set[str], List[str]], + level: str = "case", + retain: bool = True, + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filter a log object on the values of some event attribute + Filters a log object based on the values of a specified event attribute. - :param log: event log / Pandas dataframe - :param attribute_key: attribute to filter - :param values: admitted (or forbidden) values - :param level: specifies how the filter should be applied ('case' filters the cases where at least one occurrence happens, 'event' filter the events eventually trimming the cases) - :param retain: specifies if the values should be kept or removed - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param attribute_key: Attribute to filter. + :param values: Admitted or forbidden values. + :param level: Specifies how the filter should be applied ('case' filters the cases where at least one occurrence happens; 'event' filters the events, potentially trimming the cases). + :param retain: Specifies if the values should be kept or removed. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_event_attribute_values(dataframe, 'concept:name', ['Act. A', 'Act. Z'], case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_event_attribute_values( + dataframe, + 'concept:name', + ['Act. A', 'Act. 
Z'], + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -171,8 +216,7 @@ def filter_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_ from pm4py.algo.filtering.pandas.attributes import attributes_filter if level == "event": parameters[attributes_filter.Parameters.POSITIVE] = retain - return attributes_filter.apply_events(log, values, - parameters=parameters) + return attributes_filter.apply_events(log, values, parameters=parameters) elif level == "case": parameters[attributes_filter.Parameters.POSITIVE] = retain return attributes_filter.apply(log, values, parameters=parameters) @@ -180,30 +224,39 @@ def filter_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_ from pm4py.algo.filtering.log.attributes import attributes_filter if level == "event": parameters[attributes_filter.Parameters.POSITIVE] = retain - return attributes_filter.apply_events(log, values, - parameters=parameters) + return attributes_filter.apply_events(log, values, parameters=parameters) elif level == "case": parameters[attributes_filter.Parameters.POSITIVE] = retain return attributes_filter.apply(log, values, parameters=parameters) -def filter_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_key: str, values: Union[Set[str], List[str]], - retain: bool = True, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_trace_attribute_values( + log: Union[EventLog, pd.DataFrame], + attribute_key: str, + values: Union[Set[str], List[str]], + retain: bool = True, + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filter a log on the values of a trace attribute + Filters a log based on the values of a specified trace attribute. - :param log: event log / Pandas dataframe - :param attribute_key: attribute to filter - :param values: collection of values to filter - :param retain: boolean value (keep/discard matching traces) - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param attribute_key: Attribute to filter. + :param values: Collection of values to filter. + :param retain: Boolean value indicating whether to keep or discard matching traces. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_trace_attribute_values(dataframe, 'case:creator', ['Mike'], case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_trace_attribute_values( + dataframe, + 'case:creator', + ['Mike'], + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -213,32 +266,43 @@ def filter_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute_ check_pandas_dataframe_columns(log, case_id_key=case_id_key) from pm4py.algo.filtering.pandas.attributes import attributes_filter parameters[attributes_filter.Parameters.POSITIVE] = retain - return attributes_filter.apply(log, values, - parameters=parameters) + return attributes_filter.apply(log, values, parameters=parameters) else: from pm4py.algo.filtering.log.attributes import attributes_filter parameters[attributes_filter.Parameters.POSITIVE] = retain return attributes_filter.apply_trace_attribute(log, values, parameters=parameters) -def filter_variants(log: Union[EventLog, pd.DataFrame], variants: Union[Set[str], List[str], List[Tuple[str]]], retain: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[ - EventLog, pd.DataFrame]: +def filter_variants( + log: Union[EventLog, pd.DataFrame], + variants: Union[Set[str], List[str], List[Tuple[str]]], + retain: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filter a log on a specified set of variants + Filters a log based on a specified set of variants. - :param log: event log / Pandas dataframe - :param variants: collection of variants to filter; A variant should be specified as a list of tuples of activity names, e.g., [('a', 'b', 'c')] - :param retain: boolean; if True all traces conforming to the specified variants are retained; if False, all those traces are removed - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param variants: Collection of variants to filter. A variant should be specified as a list of tuples of activity names, e.g., [('a', 'b', 'c')]. + :param retain: Boolean indicating whether to retain (if True) or remove (if False) traces conforming to the specified variants. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_variants(dataframe, [('Act. A', 'Act. B', 'Act. Z'), ('Act. A', 'Act. C', 'Act. Z')], activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_variants( + dataframe, + [('Act. A', 'Act. B', 'Act. Z'), ('Act. A', 'Act. C', 'Act. 
Z')], + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -248,35 +312,45 @@ def filter_variants(log: Union[EventLog, pd.DataFrame], variants: Union[Set[str check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) from pm4py.algo.filtering.pandas.variants import variants_filter parameters[variants_filter.Parameters.POSITIVE] = retain - return variants_filter.apply(log, variants, - parameters=parameters) + return variants_filter.apply(log, variants, parameters=parameters) else: from pm4py.algo.filtering.log.variants import variants_filter parameters[variants_filter.Parameters.POSITIVE] = retain - return variants_filter.apply(log, variants, - parameters=parameters) + return variants_filter.apply(log, variants, parameters=parameters) -def filter_directly_follows_relation(log: Union[EventLog, pd.DataFrame], relations: List[str], retain: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> \ - Union[EventLog, pd.DataFrame]: +def filter_directly_follows_relation( + log: Union[EventLog, pd.DataFrame], + relations: List[str], + retain: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Retain traces that contain any of the specified 'directly follows' relations. - For example, if relations == [('a','b'),('a','c')] and log [<a,b,c>,<a,c,b>,<a,d,b>] + Retains traces that contain any of the specified 'directly follows' relations. + For example, if relations == [('a','b'),('a','c')] and log [<a,b,c>,<a,c,b>,<a,d,b>], the resulting log will contain traces describing [<a,b,c>,<a,c,b>]. - :param log: event log / Pandas dataframe - :param relations: list of activity name pairs, which are allowed/forbidden paths - :param retain: parameter that says whether the paths should be kept/removed - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param relations: List of activity name pairs, representing allowed or forbidden paths. + :param retain: Boolean indicating whether the paths should be kept (if True) or removed (if False). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_directly_follows_relation(dataframe, [('A','B'),('A','C')], activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_directly_follows_relation( + dataframe, + [('A', 'B'), ('A', 'C')], + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -291,26 +365,38 @@ def filter_directly_follows_relation(log: Union[EventLog, pd.DataFrame], relatio return paths_filter.apply(log, relations, parameters=parameters) -def filter_eventually_follows_relation(log: Union[EventLog, pd.DataFrame], relations: List[str], retain: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> \ - Union[EventLog, pd.DataFrame]: +def filter_eventually_follows_relation( + log: Union[EventLog, pd.DataFrame], + relations: List[str], + retain: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Retain traces that contain any of the specified 'eventually follows' relations. - For example, if relations == [('a','b'),('a','c')] and log [<a,b,c>,<a,c,b>,<a,d,b>] + Retains traces that contain any of the specified 'eventually follows' relations. + For example, if relations == [('a','b'),('a','c')] and log [<a,b,c>,<a,c,b>,<a,d,b>], the resulting log will contain traces describing [<a,b,c>,<a,c,b>,<a,d,b>]. - :param log: event log / Pandas dataframe - :param relations: list of activity name pairs, which are allowed/forbidden paths - :param retain: parameter that says whether the paths should be kept/removed - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param relations: List of activity name pairs, representing allowed or forbidden paths. + :param retain: Boolean indicating whether the paths should be kept (if True) or removed (if False). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_eventually_follows_relation(dataframe, [('A','B'),('A','C')], activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_eventually_follows_relation( + dataframe, + [('A', 'B'), ('A', 'C')], + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -323,8 +409,7 @@ def filter_eventually_follows_relation(log: Union[EventLog, pd.DataFrame], relat else: cases = set(log[case_id_key].to_numpy().tolist()) for path in relations: - filt_log = ltl_checker.eventually_follows(log, path, - parameters=parameters) + filt_log = ltl_checker.eventually_follows(log, path, parameters=parameters) this_traces = set(filt_log[case_id_key].to_numpy().tolist()) if retain: cases = cases.union(this_traces) @@ -339,41 +424,75 @@ def filter_eventually_follows_relation(log: Union[EventLog, pd.DataFrame], relat else: cases = set(id(trace) for trace in log) for path in relations: - filt_log = ltl_checker.eventually_follows(log, path, - parameters=parameters) + filt_log = ltl_checker.eventually_follows(log, path, parameters=parameters) this_traces = set(id(trace) for trace in filt_log) if retain: cases = cases.union(this_traces) else: cases = cases.intersection(this_traces) - filtered_log = EventLog(attributes=log.attributes, extensions=log.extensions, omni_present=log.omni_present, - classifiers=log.classifiers, properties=log.properties) + filtered_log = EventLog( + attributes=log.attributes, + extensions=log.extensions, + omni_present=log.omni_present, + classifiers=log.classifiers, + properties=log.properties + ) for trace in log: if id(trace) in cases: filtered_log.append(trace) return filtered_log -def filter_time_range(log: Union[EventLog, pd.DataFrame], dt1: str, dt2: str, mode="events", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[ - EventLog, pd.DataFrame]: - """ - Filter a log on a time interval - - :param log: event log / Pandas dataframe - :param dt1: left extreme of the interval - :param dt2: right extreme of the interval - :param mode: modality of filtering (events, traces_contained, traces_intersecting). events: any event that fits the time frame is retained; traces_contained: any trace completely contained in the timeframe is retained; traces_intersecting: any trace intersecting with the time-frame is retained. - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` +def filter_time_range( + log: Union[EventLog, pd.DataFrame], + dt1: str, + dt2: str, + mode: str = "events", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: + """ + Filters a log based on a time interval. + + :param log: Event log or Pandas DataFrame. + :param dt1: Left extreme of the interval. + :param dt2: Right extreme of the interval. + :param mode: Modality of filtering ('events', 'traces_contained', 'traces_intersecting'). + - 'events': Any event that fits the time frame is retained. + - 'traces_contained': Any trace completely contained in the timeframe is retained. + - 'traces_intersecting': Any trace intersecting with the timeframe is retained. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. 
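A minimal sketch contrasting the three modes, assuming an event log DataFrame (here called dataframe) has already been loaded with the default pm4py column names, as in the other examples; the time window is purely illustrative:

    .. code-block:: python3

        import pm4py

        # Same window, three interpretations:
        # - 'events': keeps only the events inside the window (cases may be trimmed)
        # - 'traces_contained': keeps only cases that lie entirely inside the window
        # - 'traces_intersecting': keeps every case that overlaps the window at all
        events_only = pm4py.filter_time_range(dataframe, '2010-01-01 00:00:00', '2011-01-01 00:00:00', mode='events')
        contained = pm4py.filter_time_range(dataframe, '2010-01-01 00:00:00', '2011-01-01 00:00:00', mode='traces_contained')
        overlapping = pm4py.filter_time_range(dataframe, '2010-01-01 00:00:00', '2011-01-01 00:00:00', mode='traces_intersecting')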
+ :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe1 = pm4py.filter_time_range(dataframe, '2010-01-01 00:00:00', '2011-01-01 00:00:00', mode='traces_contained', case_id_key='case:concept:name', timestamp_key='time:timestamp') - filtered_dataframe1 = pm4py.filter_time_range(dataframe, '2010-01-01 00:00:00', '2011-01-01 00:00:00', mode='traces_intersecting', case_id_key='case:concept:name', timestamp_key='time:timestamp') - filtered_dataframe1 = pm4py.filter_time_range(dataframe, '2010-01-01 00:00:00', '2011-01-01 00:00:00', mode='events', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe1 = pm4py.filter_time_range( + dataframe, + '2010-01-01 00:00:00', + '2011-01-01 00:00:00', + mode='traces_contained', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + filtered_dataframe2 = pm4py.filter_time_range( + dataframe, + '2010-01-01 00:00:00', + '2011-01-01 00:00:00', + mode='traces_intersecting', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + filtered_dataframe3 = pm4py.filter_time_range( + dataframe, + '2010-01-01 00:00:00', + '2011-01-01 00:00:00', + mode='events', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -388,7 +507,7 @@ def filter_time_range(log: Union[EventLog, pd.DataFrame], dt1: str, dt2: str, mo return timestamp_filter.filter_traces_intersecting(log, dt1, dt2, parameters=properties) else: if constants.SHOW_INTERNAL_WARNINGS: - warnings.warn('mode provided: ' + mode + ' is not recognized; original log returned!') + warnings.warn(f"Mode provided: {mode} is not recognized; original log returned!") return log else: from pm4py.algo.filtering.log.timestamp import timestamp_filter @@ -400,11 +519,18 @@ def filter_time_range(log: Union[EventLog, pd.DataFrame], dt1: str, dt2: str, mo return timestamp_filter.filter_traces_intersecting(log, dt1, dt2, parameters=properties) else: if constants.SHOW_INTERNAL_WARNINGS: - warnings.warn('mode provided: ' + mode + ' is not recognized; original log returned!') + warnings.warn(f"Mode provided: {mode} is not recognized; original log returned!") return log -def filter_between(log: Union[EventLog, pd.DataFrame], act1: Union[str, List[str]], act2: Union[str, List[str]], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_between( + log: Union[EventLog, pd.DataFrame], + act1: Union[str, List[str]], + act2: Union[str, List[str]], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ Finds all the sub-cases leading from an event with activity "act1" to an event with activity "act2" in the log, and returns a log containing only them. @@ -426,19 +552,26 @@ def filter_between(log: Union[EventLog, pd.DataFrame], act1: Union[str, List[str B C (from the third case) B E F C (from the third case) - :param log: event log / Pandas dataframe - :param act1: source activity (or collection of activities) - :param act2: target activity (or collection of activities) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. 
+ :param act1: Source activity or collection of activities. + :param act2: Target activity or collection of activities. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_between(dataframe, 'A', 'D', activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + filtered_dataframe = pm4py.filter_between( + dataframe, + 'A', + 'D', + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) @@ -452,22 +585,31 @@ def filter_between(log: Union[EventLog, pd.DataFrame], act1: Union[str, List[str return between_filter.apply(log, act1, act2, parameters=parameters) -def filter_case_size(log: Union[EventLog, pd.DataFrame], min_size: int, max_size: int, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_case_size( + log: Union[EventLog, pd.DataFrame], + min_size: int, + max_size: int, + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filters the event log, keeping the cases having a length (number of events) included between min_size - and max_size + Filters the event log, keeping cases that have a length (number of events) between min_size and max_size. - :param log: event log / Pandas dataframe - :param min_size: minimum allowed number of events - :param max_size: maximum allowed number of events - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param min_size: Minimum allowed number of events. + :param max_size: Maximum allowed number of events. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_case_size(dataframe, 5, 10, case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_case_size( + dataframe, + 5, + 10, + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -475,31 +617,42 @@ def filter_case_size(log: Union[EventLog, pd.DataFrame], min_size: int, max_size if check_is_pandas_dataframe(log): check_pandas_dataframe_columns(log, case_id_key=case_id_key) from pm4py.algo.filtering.pandas.cases import case_filter - case_id = parameters[ - constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else constants.CASE_CONCEPT_NAME + case_id = parameters[constants.PARAMETER_CONSTANT_CASEID_KEY] if constants.PARAMETER_CONSTANT_CASEID_KEY in parameters else constants.CASE_CONCEPT_NAME return case_filter.filter_on_case_size(log, case_id, min_size, max_size) else: from pm4py.algo.filtering.log.cases import case_filter return case_filter.filter_on_case_size(log, min_size, max_size) -def filter_case_performance(log: Union[EventLog, pd.DataFrame], min_performance: float, max_performance: float, timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_case_performance( + log: Union[EventLog, pd.DataFrame], + min_performance: float, + max_performance: float, + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filters the event log, keeping the cases having a duration (the timestamp of the last event minus the timestamp - of the first event) included between min_performance and max_performance + Filters the event log, keeping cases that have a duration (the timestamp of the last event minus the timestamp + of the first event) between min_performance and max_performance. - :param log: event log / Pandas dataframe - :param min_performance: minimum allowed case duration - :param max_performance: maximum allowed case duration - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param min_performance: Minimum allowed case duration. + :param max_performance: Maximum allowed case duration. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_case_performance(dataframe, 3600.0, 86400.0, timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_case_performance( + dataframe, + 3600.0, + 86400.0, + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -513,23 +666,37 @@ def filter_case_performance(log: Union[EventLog, pd.DataFrame], min_performance: return case_filter.filter_case_performance(log, min_performance, max_performance, parameters=parameters) -def filter_activities_rework(log: Union[EventLog, pd.DataFrame], activity: str, min_occurrences: int = 2, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_activities_rework( + log: Union[EventLog, pd.DataFrame], + activity: str, + min_occurrences: int = 2, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filters the event log, keeping the cases where the specified activity occurs at least min_occurrences times. + Filters the event log, keeping cases where the specified activity occurs at least min_occurrences times. - :param log: event log / Pandas dataframe - :param activity: activity - :param min_occurrences: minimum desidered number of occurrences - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param activity: Activity to consider. + :param min_occurrences: Minimum desired number of occurrences. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_activities_rework(dataframe, 'Approve Order', 2, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_activities_rework( + dataframe, + 'Approve Order', + 2, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -544,34 +711,50 @@ def filter_activities_rework(log: Union[EventLog, pd.DataFrame], activity: str, return rework_filter.apply(log, activity, parameters=parameters) -def filter_paths_performance(log: Union[EventLog, pd.DataFrame], path: Tuple[str, str], min_performance: float, max_performance: float, keep=True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: - """ - Filters the event log, either: - - (keep=True) keeping the cases having the specified path (tuple of 2 activities) with a duration included between min_performance and max_performance - - (keep=False) discarding the cases having the specified path with a duration included between min_performance and max_performance - - :param log: event log / Pandas dataframe - :param path: tuple of two activities (source_activity, target_activity) - :param min_performance: minimum allowed performance (of the path) - :param max_performance: maximum allowed performance (of the path) - :param keep: keep/discard the cases having the specified path with a duration included between min_performance and max_performance - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` +def filter_paths_performance( + log: Union[EventLog, pd.DataFrame], + path: Tuple[str, str], + min_performance: float, + max_performance: float, + keep: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: + """ + Filters the event log based on the performance of specified paths. + + - If keep=True, retains cases having the specified path (tuple of 2 activities) with a duration between min_performance and max_performance. + - If keep=False, discards cases having the specified path with a duration between min_performance and max_performance. + + :param log: Event log or Pandas DataFrame. + :param path: Tuple of two activities (source_activity, target_activity). + :param min_performance: Minimum allowed performance of the path. + :param max_performance: Maximum allowed performance of the path. + :param keep: Boolean indicating whether to keep (if True) or discard (if False) the cases with the specified performance. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_paths_performance(dataframe, ('A', 'D'), 3600.0, 86400.0, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_paths_performance( + dataframe, + ('A', 'D'), + 3600.0, + 86400.0, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) parameters["positive"] = keep - parameters["min_performance"] = min_performance - parameters["max_performance"] = max_performance path = tuple(path) if check_is_pandas_dataframe(log): check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) @@ -582,22 +765,34 @@ def filter_paths_performance(log: Union[EventLog, pd.DataFrame], path: Tuple[str return paths_filter.apply_performance(log, path, parameters=parameters) -def filter_variants_top_k(log: Union[EventLog, pd.DataFrame], k: int, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_variants_top_k( + log: Union[EventLog, pd.DataFrame], + k: int, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Keeps the top-k variants of the log + Keeps the top-k variants of the log. - :param log: event log / Pandas dataframe - :param k: number of variants that should be kept - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param k: Number of variants to keep. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_variants_top_k(dataframe, 5, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_variants_top_k( + dataframe, + 5, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -611,25 +806,39 @@ def filter_variants_top_k(log: Union[EventLog, pd.DataFrame], k: int, activity_k return variants_filter.filter_variants_top_k(log, k, parameters=parameters) -def filter_variants_by_coverage_percentage(log: Union[EventLog, pd.DataFrame], min_coverage_percentage: float, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: - """ - Filters the variants of the log by a coverage percentage - (e.g., if min_coverage_percentage=0.4, and we have a log with 1000 cases, - of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3, - the filter keeps only the traces of variant 1 and variant 2). 
- - :param log: event log / Pandas dataframe - :param min_coverage_percentage: minimum allowed percentage of coverage - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` +def filter_variants_by_coverage_percentage( + log: Union[EventLog, pd.DataFrame], + min_coverage_percentage: float, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: + """ + Filters the variants of the log based on a coverage percentage. + For example, if min_coverage_percentage=0.4 and the log has 1000 cases with: + - 500 cases of variant 1, + - 400 cases of variant 2, + - 100 cases of variant 3, + the filter keeps only the traces of variant 1 and variant 2. + + :param log: Event log or Pandas DataFrame. + :param min_coverage_percentage: Minimum allowed percentage of coverage. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_variants_by_coverage_percentage(dataframe, 0.1, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_variants_by_coverage_percentage( + dataframe, + 0.1, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -643,67 +852,47 @@ def filter_variants_by_coverage_percentage(log: Union[EventLog, pd.DataFrame], m return variants_filter.filter_variants_by_coverage_percentage(log, min_coverage_percentage, parameters=parameters) -def filter_variants_by_maximum_coverage_percentage(log: Union[EventLog, pd.DataFrame], max_coverage_percentage: float, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_prefixes( + log: Union[EventLog, pd.DataFrame], + activity: str, + strict: bool = True, + first_or_last: str = "first", + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filters the variants of the log by a maximum coverage percentage - (e.g., if max_coverage_percentage=0.4, and we have a log with 1000 cases, - of which 500 of the variant 1, 400 of the variant 2, and 100 of the variant 3, - the filter keeps only the traces of variant 2 and variant 3). - - :param log: event log / Pandas dataframe - :param max_coverage_percentage: maximum allowed percentage of coverage - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` - - .. 
code-block:: python3 - - import pm4py - - filtered_dataframe = pm4py.filter_variants_by_maximum_coverage_percentage(dataframe, 0.1, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') - """ - if type(log) not in [pd.DataFrame, EventLog, EventStream]: raise Exception("the method can be applied only to a traditional event log!") - __event_log_deprecation_warning(log) - - parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - from pm4py.algo.filtering.pandas.variants import variants_filter - return variants_filter.filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=parameters) - else: - from pm4py.algo.filtering.log.variants import variants_filter - return variants_filter.filter_variants_by_maximum_coverage_percentage(log, max_coverage_percentage, parameters=parameters) - - -def filter_prefixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=True, first_or_last="first", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: - """ - Filters the log, keeping the prefixes to a given activity. E.g., for a log with traces: - - A,B,C,D - A,B,Z,A,B,C,D - A,B,C,D,C,E,C,F + Filters the log, keeping the prefixes leading up to a given activity. + For example, for a log with traces: + - A,B,C,D + - A,B,Z,A,B,C,D + - A,B,C,D,C,E,C,F The prefixes to "C" are respectively: - - A,B - A,B,Z,A,B - A,B - - :param log: event log / Pandas dataframe - :param activity: target activity of the filter - :param strict: applies the filter strictly (cuts the occurrences of the selected activity). - :param first_or_last: decides if the first or last occurrence of an activity should be selected as baseline for the filter. - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + - A,B + - A,B,Z,A,B + - A,B + + :param log: Event log or Pandas DataFrame. + :param activity: Target activity for the filter. + :param strict: Applies the filter strictly, cutting the occurrences of the selected activity. + :param first_or_last: Decides if the first or last occurrence of an activity should be selected as the baseline for the filter. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_prefixes(dataframe, 'Act. C', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_prefixes( + dataframe, + 'Act. 
C', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -720,34 +909,47 @@ def filter_prefixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=Tr return prefix_filter.apply(log, activity, parameters=parameters) -def filter_suffixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=True, first_or_last="first", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def filter_suffixes( + log: Union[EventLog, pd.DataFrame], + activity: str, + strict: bool = True, + first_or_last: str = "first", + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Filters the log, keeping the suffixes from a given activity. E.g., for a log with traces: - - A,B,C,D - A,B,Z,A,B,C,D - A,B,C,D,C,E,C,F + Filters the log, keeping the suffixes starting from a given activity. + For example, for a log with traces: + - A,B,C,D + - A,B,Z,A,B,C,D + - A,B,C,D,C,E,C,F The suffixes from "C" are respectively: - - D - D - D,C,E,C,F - - :param log: event log / Pandas dataframe - :param activity: target activity of the filter - :param strict: applies the filter strictly (cuts the occurrences of the selected activity). - :param first_or_last: decides if the first or last occurrence of an activity should be selected as baseline for the filter. - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + - D + - D + - D,C,E,C,F + + :param log: Event log or Pandas DataFrame. + :param activity: Target activity for the filter. + :param strict: Applies the filter strictly, cutting the occurrences of the selected activity. + :param first_or_last: Decides if the first or last occurrence of an activity should be selected as the baseline for the filter. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_prefixes(dataframe, 'Act. C', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_suffixes( + dataframe, + 'Act. C', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -764,74 +966,115 @@ def filter_suffixes(log: Union[EventLog, pd.DataFrame], activity: str, strict=Tr return suffix_filter.apply(log, activity, parameters=parameters) -def filter_ocel_event_attribute(ocel: OCEL, attribute_key: str, attribute_values: Collection[Any], positive: bool = True) -> OCEL: +def filter_ocel_event_attribute( + ocel: OCEL, + attribute_key: str, + attribute_values: Collection[Any], + positive: bool = True +) -> OCEL: """ - Filters the object-centric event log on the provided event attributes values + Filters the object-centric event log based on the provided event attribute values. 
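A minimal sketch of how the positive flag switches this filter between keeping and removing the matching events; the file path and the activity values are illustrative, reused from the other OCEL examples in this module:

    .. code-block:: python3

        import pm4py

        ocel = pm4py.read_ocel('log.jsonocel')

        # positive=True (the default) keeps only events whose activity is in the list ...
        kept = pm4py.filter_ocel_event_attribute(ocel, 'ocel:activity', ['A', 'B'])

        # ... while positive=False removes exactly those events instead.
        removed = pm4py.filter_ocel_event_attribute(ocel, 'ocel:activity', ['A', 'B'], positive=False)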
- :param ocel: object-centric event log - :param attribute_key: attribute at the event level - :param attribute_values: collection of attribute values - :param positive: decides if the values should be kept (positive=True) or removed (positive=False) - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param attribute_key: Attribute at the event level to filter. + :param attribute_values: Collection of attribute values to keep or remove. + :param positive: Determines whether the values should be kept (True) or removed (False). + :return: Filtered OCEL. .. code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_event_attribute(ocel, 'ocel:activity', ['A', 'B', 'D']) + filtered_ocel = pm4py.filter_ocel_event_attribute( + ocel, + 'ocel:activity', + ['A', 'B', 'D'] + ) """ from pm4py.algo.filtering.ocel import event_attributes - return event_attributes.apply(ocel, attribute_values, parameters={event_attributes.Parameters.ATTRIBUTE_KEY: attribute_key, event_attributes.Parameters.POSITIVE: positive}) + return event_attributes.apply( + ocel, + attribute_values, + parameters={ + event_attributes.Parameters.ATTRIBUTE_KEY: attribute_key, + event_attributes.Parameters.POSITIVE: positive + } + ) -def filter_ocel_object_attribute(ocel: OCEL, attribute_key: str, attribute_values: Collection[Any], positive: bool = True) -> OCEL: +def filter_ocel_object_attribute( + ocel: OCEL, + attribute_key: str, + attribute_values: Collection[Any], + positive: bool = True +) -> OCEL: """ - Filters the object-centric event log on the provided object attributes values + Filters the object-centric event log based on the provided object attribute values. - :param ocel: object-centric event log - :param attribute_key: attribute at the event level - :param attribute_values: collection of attribute values - :param positive: decides if the values should be kept (positive=True) or removed (positive=False) - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param attribute_key: Attribute at the object level to filter. + :param attribute_values: Collection of attribute values to keep or remove. + :param positive: Determines whether the values should be kept (True) or removed (False). + :return: Filtered OCEL. .. code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_object_attribute(ocel, 'ocel:type', ['order']) + filtered_ocel = pm4py.filter_ocel_object_attribute( + ocel, + 'ocel:type', + ['order'] + ) """ from pm4py.algo.filtering.ocel import object_attributes - return object_attributes.apply(ocel, attribute_values, parameters={object_attributes.Parameters.ATTRIBUTE_KEY: attribute_key, object_attributes.Parameters.POSITIVE: positive}) + return object_attributes.apply( + ocel, + attribute_values, + parameters={ + object_attributes.Parameters.ATTRIBUTE_KEY: attribute_key, + object_attributes.Parameters.POSITIVE: positive + } + ) -def filter_ocel_object_types_allowed_activities(ocel: OCEL, correspondence_dict: Dict[str, Collection[str]]) -> OCEL: +def filter_ocel_object_types_allowed_activities( + ocel: OCEL, + correspondence_dict: Dict[str, Collection[str]] +) -> OCEL: """ - Filters an object-centric event log keeping only the specified object types - with the specified activity set (filters out the rest). + Filters an object-centric event log, keeping only the specified object types with the specified set of allowed activities. 
- :param ocel: object-centric event log - :param correspondence_dict: dictionary containing, for every object type of interest, a collection of allowed activities. Example: {"order": ["Create Order"], "element": ["Create Order", "Create Delivery"]} - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param correspondence_dict: Dictionary containing, for every object type of interest, a collection of allowed activities. + Example: {"order": ["Create Order"], "element": ["Create Order", "Create Delivery"]}. + :return: Filtered OCEL. .. code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_object_types_allowed_activities(ocel, {'order': ['create order', 'pay order'], 'item}) + filtered_ocel = pm4py.filter_ocel_object_types_allowed_activities( + ocel, + {'order': ['create order', 'pay order'], 'item': ['create item', 'deliver item']} + ) """ from pm4py.algo.filtering.ocel import activity_type_matching return activity_type_matching.apply(ocel, correspondence_dict) -def filter_ocel_object_per_type_count(ocel: OCEL, min_num_obj_type: Dict[str, int]) -> OCEL: +def filter_ocel_object_per_type_count( + ocel: OCEL, + min_num_obj_type: Dict[str, int] +) -> OCEL: """ - Filters the events of the object-centric logs which are related to at least - the specified amount of objects per type. + Filters the events of the object-centric logs that are related to at least the specified number of objects per type. - E.g. pm4py.filter_object_per_type_count(ocel, {"order": 1, "element": 2}) + Example: + pm4py.filter_object_per_type_count(ocel, {"order": 1, "element": 2}) Would keep the following events: @@ -840,98 +1083,144 @@ def filter_ocel_object_per_type_count(ocel: OCEL, min_num_obj_type: Dict[str, in 1 e11 1981-01-01 Create Order [i6, i5] [o2] 2 e14 1981-01-04 Create Order [i8, i7] [o3] - :param ocel: object-centric event log - :param min_num_obj_type: minimum number of objects per type - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param min_num_obj_type: Minimum number of objects per type. + :return: Filtered OCEL. .. code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_object_per_type_count(ocel, {'order': 1, 'element': 2}) + filtered_ocel = pm4py.filter_ocel_object_per_type_count( + ocel, + {'order': 1, 'element': 2} + ) """ from pm4py.algo.filtering.ocel import objects_ot_count return objects_ot_count.apply(ocel, min_num_obj_type) -def filter_ocel_start_events_per_object_type(ocel: OCEL, object_type: str) -> OCEL: +def filter_ocel_start_events_per_object_type( + ocel: OCEL, + object_type: str +) -> OCEL: """ - Filters the events in which a new object for the given object type is spawn. - (E.g. an event with activity "Create Order" might spawn new orders). + Filters the events in which a new object of the given object type is spawned. + For example, an event with activity "Create Order" might spawn new orders. - :param ocel: object-centric event log - :param object_type: object type to consider - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param object_type: Object type to consider. + :return: Filtered OCEL. .. 
code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_start_events_per_object_type(ocel, 'delivery') + filtered_ocel = pm4py.filter_ocel_start_events_per_object_type( + ocel, + 'delivery' + ) """ from pm4py.algo.filtering.ocel import ot_endpoints return ot_endpoints.filter_start_events_per_object_type(ocel, object_type) -def filter_ocel_end_events_per_object_type(ocel: OCEL, object_type: str) -> OCEL: +def filter_ocel_end_events_per_object_type( + ocel: OCEL, + object_type: str +) -> OCEL: """ - Filters the events in which an object for the given object type terminates its lifecycle. - (E.g. an event with activity "Pay Order" might terminate an order). + Filters the events in which an object of the given object type terminates its lifecycle. + For example, an event with activity "Pay Order" might terminate an order. - :param ocel: object-centric event log - :param object_type: object type to consider - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param object_type: Object type to consider. + :return: Filtered OCEL. .. code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_end_events_per_object_type(ocel, 'delivery') + filtered_ocel = pm4py.filter_ocel_end_events_per_object_type( + ocel, + 'delivery' + ) """ from pm4py.algo.filtering.ocel import ot_endpoints return ot_endpoints.filter_end_events_per_object_type(ocel, object_type) -def filter_ocel_events_timestamp(ocel: OCEL, min_timest: Union[datetime.datetime, str], max_timest: Union[datetime.datetime, str], timestamp_key: str = "ocel:timestamp") -> OCEL: +def filter_ocel_events_timestamp( + ocel: OCEL, + min_timest: Union[datetime.datetime, str], + max_timest: Union[datetime.datetime, str], + timestamp_key: str = "ocel:timestamp" +) -> OCEL: """ - Filters the object-centric event log keeping events in the provided timestamp range + Filters the object-centric event log, keeping events within the provided timestamp range. - :param ocel: object-centric event log - :param min_timest: left extreme of the allowed timestamp interval (provided in the format: YYYY-mm-dd HH:MM:SS) - :param max_timest: right extreme of the allowed timestamp interval (provided in the format: YYYY-mm-dd HH:MM:SS) - :param timestamp_key: the attribute to use as timestamp (default: ocel:timestamp) - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param min_timest: Left extreme of the allowed timestamp interval (format: YYYY-mm-dd HH:MM:SS). + :param max_timest: Right extreme of the allowed timestamp interval (format: YYYY-mm-dd HH:MM:SS). + :param timestamp_key: The attribute to use as timestamp (default: ocel:timestamp). + :return: Filtered OCEL. .. code-block:: python3 import pm4py - filtered_ocel = pm4py.filter_ocel_events_timestamp(ocel, '1990-01-01 00:00:00', '2010-01-01 00:00:00') + filtered_ocel = pm4py.filter_ocel_events_timestamp( + ocel, + '1990-01-01 00:00:00', + '2010-01-01 00:00:00' + ) """ from pm4py.algo.filtering.ocel import event_attributes - return event_attributes.apply_timestamp(ocel, min_timest, max_timest, parameters={"pm4py:param:timestamp_key": timestamp_key}) - - -def filter_four_eyes_principle(log: Union[EventLog, pd.DataFrame], activity1: str, activity2: str, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", resource_key: str = "org:resource", keep_violations: bool = False) -> Union[EventLog, pd.DataFrame]: - """ - Filter out the cases of the log violating the four eyes principle on the provided activities. 
- - :param log: event log - :param activity1: first activity - :param activity2: second activity - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param resource_key: attribute to be used as resource - :param keep_violations: boolean to discard (if False) or retain (if True) the violations - :rtype: ``Union[EventLog, pd.DataFrame]`` + return event_attributes.apply_timestamp( + ocel, + min_timest, + max_timest, + parameters={"pm4py:param:timestamp_key": timestamp_key} + ) + + +def filter_four_eyes_principle( + log: Union[EventLog, pd.DataFrame], + activity1: str, + activity2: str, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + resource_key: str = "org:resource", + keep_violations: bool = False +) -> Union[EventLog, pd.DataFrame]: + """ + Filters out the cases of the log that violate the four-eyes principle on the provided activities. + + :param log: Event log or Pandas DataFrame. + :param activity1: First activity. + :param activity2: Second activity. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :param resource_key: Attribute to be used as resource. + :param keep_violations: Boolean indicating whether to discard (if False) or retain (if True) the violations. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_four_eyes_principle(dataframe, 'Act. A', 'Act. B', activity_key='concept:name', resource_key='org:resource', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_four_eyes_principle( + dataframe, + 'Act. A', + 'Act. B', + activity_key='concept:name', + resource_key='org:resource', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -948,24 +1237,39 @@ def filter_four_eyes_principle(log: Union[EventLog, pd.DataFrame], activity1: st return ltl_checker.four_eyes_principle(log, activity1, activity2, parameters=properties) -def filter_activity_done_different_resources(log: Union[EventLog, pd.DataFrame], activity: str, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", resource_key: str = "org:resource", keep_violations: bool = True) -> Union[EventLog, pd.DataFrame]: +def filter_activity_done_different_resources( + log: Union[EventLog, pd.DataFrame], + activity: str, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + resource_key: str = "org:resource", + keep_violations: bool = True +) -> Union[EventLog, pd.DataFrame]: """ - Filters the cases where an activity is repeated by different resources. + Filters the cases where an activity is performed by different resources multiple times. 
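A minimal sketch showing how these two resource-based checks can be used to isolate the violating cases; the DataFrame (dataframe) and the activity names are illustrative, as in the other examples:

    .. code-block:: python3

        import pm4py

        # Cases in which 'Act. A' and 'Act. B' were executed by the same resource
        # (keep_violations=True retains the violating cases instead of dropping them).
        four_eyes_violations = pm4py.filter_four_eyes_principle(
            dataframe, 'Act. A', 'Act. B', keep_violations=True
        )

        # Cases in which 'Act. A' was executed by more than one resource.
        repeated_by_different_resources = pm4py.filter_activity_done_different_resources(
            dataframe, 'Act. A', keep_violations=True
        )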
- :param log: event log - :param activity: activity to consider - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param resource_key: attribute to be used as resource - :param keep_violations: boolean to discard (if False) or retain (if True) the violations - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param activity: Activity to consider. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :param resource_key: Attribute to be used as resource. + :param keep_violations: Boolean indicating whether to discard (if False) or retain (if True) the violations. + :return: Filtered event log or Pandas DataFrame. .. code-block:: python3 import pm4py - filtered_dataframe = pm4py.filter_activity_done_different_resources(dataframe, 'Act. A', activity_key='concept:name', resource_key='org:resource', timestamp_key='time:timestamp', case_id_key='case:concept:name') + filtered_dataframe = pm4py.filter_activity_done_different_resources( + dataframe, + 'Act. A', + activity_key='concept:name', + resource_key='org:resource', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -982,28 +1286,35 @@ def filter_activity_done_different_resources(log: Union[EventLog, pd.DataFrame], return ltl_checker.attr_value_different_persons(log, activity, parameters=properties) -def filter_trace_segments(log: Union[EventLog, pd.DataFrame], admitted_traces: List[List[str]], positive: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: - """ - Filters an event log on a set of traces. A trace is a sequence of activities and "...", in which: - - a "..." before an activity tells that other activities can precede the given activity - - a "..." 
after an activity tells that other activities can follow the given activity - - For example: - - pm4py.filter_trace_segments(log, [["A", "B"]]) <- filters only the cases of the event log having exactly the process variant A,B - - pm4py.filter_trace_segments(log, [["...", "A", "B"]]) <- filters only the cases of the event log ending with the activities A,B - - pm4py.filter_trace_segments(log, [["A", "B", "..."]]) <- filters only the cases of the event log starting with the activities A,B - - pm4py.filter_trace_segments(log, [["...", "A", "B", "C", "..."], ["...", "D", "E", "F", "..."]] - <- filters only the cases of the event log in which at any point - there is A followed by B followed by C, and in which at any other point there is - D followed by E followed by F - - :param log: event log / Pandas dataframe - :param admitted_traces: collection of traces admitted from the filter (with the aforementioned criteria) - :param positive: (boolean) indicates if the filter should keep/discard the cases satisfying the filter - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` +def filter_trace_segments( + log: Union[EventLog, pd.DataFrame], + admitted_traces: List[List[str]], + positive: bool = True, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: + """ + Filters an event log based on a set of trace segments. A trace is a sequence of activities and "..." + where: + - "..." before an activity indicates that other activities can precede the given activity. + - "..." after an activity indicates that other activities can follow the given activity. + + Examples: + - pm4py.filter_trace_segments(log, [["A", "B"]]) retains only cases with the exact process variant A,B. + - pm4py.filter_trace_segments(log, [["...", "A", "B"]]) retains only cases ending with activities A,B. + - pm4py.filter_trace_segments(log, [["A", "B", "..."]]) retains only cases starting with activities A,B. + - pm4py.filter_trace_segments(log, [["...", "A", "B", "C", "..."], ["...", "D", "E", "F", "..."]]) retains cases where: + - At any point, there is A followed by B followed by C, + - And at any other point, there is D followed by E followed by F. + + :param log: Event log or Pandas DataFrame. + :param admitted_traces: Collection of trace segments to admit based on the criteria above. + :param positive: Boolean indicating whether to keep (if True) or discard (if False) the cases satisfying the filter. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. + :return: Filtered event log or Pandas DataFrame. .. 
code-block:: python3 @@ -1011,7 +1322,10 @@ def filter_trace_segments(log: Union[EventLog, pd.DataFrame], admitted_traces: L log = pm4py.read_xes("tests/input_data/running-example.xes") - filtered_log = pm4py.filter_trace_segments(log, [["...", "check ticket", "decide", "reinitiate request", "..."]]) + filtered_log = pm4py.filter_trace_segments( + log, + [["...", "check ticket", "decide", "reinitiate request", "..."]] + ) print(filtered_log) """ __event_log_deprecation_warning(log) @@ -1028,23 +1342,30 @@ def filter_trace_segments(log: Union[EventLog, pd.DataFrame], admitted_traces: L return trace_filter.apply(log, admitted_traces, parameters=parameters) -def filter_ocel_object_types(ocel: OCEL, obj_types: Collection[str], positive: bool = True, level: int = 1) -> OCEL: +def filter_ocel_object_types( + ocel: OCEL, + obj_types: Collection[str], + positive: bool = True, + level: int = 1 +) -> OCEL: """ Filters the object types of an object-centric event log. - :param ocel: object-centric event log - :param obj_types: object types to keep/remove - :param positive: boolean value (True=keep, False=remove) - :param level: recursively expand the set of object identifiers until the specified level - - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param obj_types: Object types to keep or remove. + :param positive: Boolean indicating whether to keep (True) or remove (False) the specified object types. + :param level: Recursively expands the set of object identifiers until the specified level. + :return: Filtered OCEL. .. code-block:: python3 import pm4py ocel = pm4py.read_ocel('log.jsonocel') - filtered_ocel = pm4py.filter_ocel_object_types(ocel, ['order']) + filtered_ocel = pm4py.filter_ocel_object_types( + ocel, + ['order'] + ) """ from copy import copy from pm4py.objects.ocel.util import filtering_utils @@ -1056,26 +1377,37 @@ def filter_ocel_object_types(ocel: OCEL, obj_types: Collection[str], positive: b filtered_ocel.objects = filtered_ocel.objects[~filtered_ocel.objects[filtered_ocel.object_type_column].isin(obj_types)] return filtering_utils.propagate_object_filtering(filtered_ocel) else: - object_ids = pandas_utils.format_unique(ocel.objects[ocel.objects[ocel.object_type_column].isin(obj_types)][ocel.object_id_column].unique()) + object_ids = pandas_utils.format_unique( + ocel.objects[ocel.objects[ocel.object_type_column].isin(obj_types)][ocel.object_id_column].unique() + ) return filter_ocel_objects(ocel, object_ids, level=level, positive=positive) -def filter_ocel_objects(ocel: OCEL, object_identifiers: Collection[str], positive: bool = True, level: int = 1) -> OCEL: +def filter_ocel_objects( + ocel: OCEL, + object_identifiers: Collection[str], + positive: bool = True, + level: int = 1 +) -> OCEL: """ Filters the object identifiers of an object-centric event log. - :param ocel: object-centric event log - :param object_identifiers: object identifiers to keep/remove - :param positive: boolean value (True=keep, False=remove) - :param level: recursively expand the set of object identifiers until the specified level - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param object_identifiers: Object identifiers to keep or remove. + :param positive: Boolean indicating whether to keep (True) or remove (False) the specified object identifiers. + :param level: Recursively expands the set of object identifiers until the specified level. + :return: Filtered OCEL. .. 
code-block:: python3 import pm4py ocel = pm4py.read_ocel('log.jsonocel') - filtered_ocel = pm4py.filter_ocel_objects(ocel, ['o1'], level=1) + filtered_ocel = pm4py.filter_ocel_objects( + ocel, + ['o1'], + level=1 + ) """ object_identifiers = set(object_identifiers) if level > 1: @@ -1098,35 +1430,42 @@ def filter_ocel_objects(ocel: OCEL, object_identifiers: Collection[str], positiv from pm4py.objects.ocel.util import filtering_utils filtered_ocel = copy(ocel) if positive: - filtered_ocel.objects = filtered_ocel.objects[filtered_ocel.objects[filtered_ocel.object_id_column].isin(object_identifiers)] + filtered_ocel.objects = filtered_ocel.objects[filtered_ocel.objects[ocel.object_id_column].isin(object_identifiers)] else: - filtered_ocel.objects = filtered_ocel.objects[~filtered_ocel.objects[filtered_ocel.object_id_column].isin(object_identifiers)] + filtered_ocel.objects = filtered_ocel.objects[~filtered_ocel.objects[ocel.object_id_column].isin(object_identifiers)] return filtering_utils.propagate_object_filtering(filtered_ocel) -def filter_ocel_events(ocel: OCEL, event_identifiers: Collection[str], positive: bool = True) -> OCEL: +def filter_ocel_events( + ocel: OCEL, + event_identifiers: Collection[str], + positive: bool = True +) -> OCEL: """ Filters the event identifiers of an object-centric event log. - :param ocel: object-centric event log - :param event_identifiers: event identifiers to keep/remove - :param positive: boolean value (True=keep, False=remove) - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param event_identifiers: Event identifiers to keep or remove. + :param positive: Boolean indicating whether to keep (True) or remove (False) the specified event identifiers. + :return: Filtered OCEL. .. code-block:: python3 import pm4py ocel = pm4py.read_ocel('log.jsonocel') - filtered_ocel = pm4py.filter_ocel_events(ocel, ['e1']) + filtered_ocel = pm4py.filter_ocel_events( + ocel, + ['e1'] + ) """ from copy import copy from pm4py.objects.ocel.util import filtering_utils filtered_ocel = copy(ocel) if positive: - filtered_ocel.events = filtered_ocel.events[filtered_ocel.events[filtered_ocel.event_id_column].isin(event_identifiers)] + filtered_ocel.events = filtered_ocel.events[filtered_ocel.events[ocel.event_id_column].isin(event_identifiers)] else: - filtered_ocel.events = filtered_ocel.events[~filtered_ocel.events[filtered_ocel.event_id_column].isin(event_identifiers)] + filtered_ocel.events = filtered_ocel.events[~filtered_ocel.events[ocel.event_id_column].isin(event_identifiers)] return filtering_utils.propagate_event_filtering(filtered_ocel) @@ -1156,23 +1495,30 @@ def filter_ocel_activities_connected_object_type(ocel: OCEL, object_type: str) - return filtering_utils.propagate_relations_filtering(filtered_ocel) -def filter_ocel_cc_object(ocel: OCEL, object_id: str, conn_comp: Optional[List[List[str]]] = None, return_conn_comp: bool = False) -> Union[OCEL, Tuple[OCEL, List[List[str]]]]: +def filter_ocel_cc_object( + ocel: OCEL, + object_id: str, + conn_comp: Optional[List[List[str]]] = None, + return_conn_comp: bool = False +) -> Union[OCEL, Tuple[OCEL, List[List[str]]]]: """ - Returns the connected component of the object-centric event log - to which the object with the provided identifier belongs. + Returns the connected component of the object-centric event log to which the specified object belongs. 
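For the level parameter described above, a small illustrative sketch (the OCEL path and the object identifier 'o1' are placeholders; the expansion behaviour is as stated in the docstring above, i.e. level > 1 recursively enlarges the set of object identifiers):

.. code-block:: python3

    import pm4py

    ocel = pm4py.read_ocel('log.jsonocel')  # placeholder path

    # level=1: keep only the listed object identifier(s)
    direct = pm4py.filter_ocel_objects(ocel, ['o1'], level=1)

    # level=2: additionally keep the objects reachable from 'o1' in one expansion step
    neighbourhood = pm4py.filter_ocel_objects(ocel, ['o1'], level=2)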
-
- :param ocel: object-centric event log
- :param object_id: object identifier
- :param conn_comp: (optional) connected components of the objects of the OCEL
- :param return_conn_comp: if True, returns the computed connected components of the OCEL
- :rtype: ``Union[OCEL, Tuple[OCEL, List[List[str]]]]``
+ :param ocel: Object-centric event log.
+ :param object_id: Object identifier.
+ :param conn_comp: (Optional) Precomputed connected components of the OCEL objects.
+ :param return_conn_comp: If True, returns the filtered OCEL along with the computed connected components.
+ :return: Filtered OCEL, optionally with the list of connected components.

 .. code-block:: python3

 import pm4py

 ocel = pm4py.read_ocel('log.jsonocel')
- filtered_ocel = pm4py.filter_ocel_cc_object(ocel, 'order1')
+ filtered_ocel = pm4py.filter_ocel_cc_object(
+ ocel,
+ 'order1'
+ )
 """
 if conn_comp is None:
 from pm4py.algo.transformation.ocel.graphs import object_interaction_graph
@@ -1198,25 +1544,33 @@ def filter_ocel_cc_object(ocel: OCEL, object_id: str, conn_comp: Optional[List[L
 return filter_ocel_objects(ocel, [object_id])

-def filter_ocel_cc_length(ocel: OCEL, min_cc_length: int, max_cc_length: int) -> OCEL:
+def filter_ocel_cc_length(
+ ocel: OCEL,
+ min_cc_length: int,
+ max_cc_length: int
+) -> OCEL:
 """
 Keeps only the objects in an OCEL belonging to a connected component with a length
- falling in a specified range
+ falling within the specified range.

- Paper:
- Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022.
+ Reference:
+ Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data."
+ 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022.

- :param ocel: object-centric event log
- :param min_cc_length: minimum allowed length for the connected component
- :param max_cc_length: maximum allowed length for the connected component
- :rtype: ``OCEL``
+ :param ocel: Object-centric event log.
+ :param min_cc_length: Minimum allowed length for the connected component.
+ :param max_cc_length: Maximum allowed length for the connected component.
+ :return: Filtered OCEL.

 .. code-block:: python3

 import pm4py

 ocel = pm4py.read_ocel('log.jsonocel')
- filtered_ocel = pm4py.filter_ocel_cc_length(ocel, 2, 10)
+ filtered_ocel = pm4py.filter_ocel_cc_length(
+ ocel,
+ 2,
+ 10
+ )
 """
 from pm4py.algo.transformation.ocel.graphs import object_interaction_graph
@@ -1233,30 +1587,41 @@ def filter_ocel_cc_length(ocel: OCEL, min_cc_length: int, max_cc_length: int) ->
 return filter_ocel_objects(ocel, objs)

-def filter_ocel_cc_otype(ocel: OCEL, otype: str, positive: bool = True) -> OCEL:
+def filter_ocel_cc_otype(
+ ocel: OCEL,
+ otype: str,
+ positive: bool = True
+) -> OCEL:
 """
- Filters the objects belonging to the connected components having at least an object
- of the provided object type.
+ Filters the objects belonging to connected components that have at least one object of the specified type.

- Paper:
- Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022.
+ Reference:
+ Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data."
+ 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022.
- :param ocel: object-centric event log - :param otype: object type - :param positive: boolean that keeps or discards the objects of these components - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param otype: Object type to consider. + :param positive: Boolean indicating whether to keep (True) or discard (False) the objects in these components. + :return: Filtered OCEL. .. code-block:: python3 import pm4py ocel = pm4py.read_ocel('log.jsonocel') - filtered_ocel = pm4py.filter_ocel_cc_otype(ocel, 'order') + filtered_ocel = pm4py.filter_ocel_cc_otype( + ocel, + 'order' + ) """ if positive: - objs = set(ocel.objects[ocel.objects[ocel.object_type_column] == otype][ocel.object_id_column]) + objs = set( + ocel.objects[ocel.objects[ocel.object_type_column] == otype][ocel.object_id_column] + ) else: - objs = set(ocel.objects[~(ocel.objects[ocel.object_type_column] == otype)][ocel.object_id_column]) + objs = set( + ocel.objects[~(ocel.objects[ocel.object_type_column] == otype)][ocel.object_id_column] + ) from pm4py.algo.transformation.ocel.graphs import object_interaction_graph @@ -1274,27 +1639,35 @@ def filter_ocel_cc_otype(ocel: OCEL, otype: str, positive: bool = True) -> OCEL: return filter_ocel_objects(ocel, objs) -def filter_ocel_cc_activity(ocel: OCEL, activity: str) -> OCEL: +def filter_ocel_cc_activity( + ocel: OCEL, + activity: str +) -> OCEL: """ - Filters the objects belonging to the connected components having at least an event - with the provided activity. + Filters the objects belonging to connected components that include at least one event with the specified activity. - Paper: - Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022. + Reference: + Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." + 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022. - :param ocel: object-centric event log - :param activity: activity - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :param activity: Activity to consider. + :return: Filtered OCEL. .. code-block:: python3 import pm4py ocel = pm4py.read_ocel('log.jsonocel') - filtered_ocel = pm4py.filter_ocel_cc_activity(ocel, 'Create Order') + filtered_ocel = pm4py.filter_ocel_cc_activity( + ocel, + 'Create Order' + ) """ evs = ocel.events[ocel.events[ocel.event_activity] == activity][ocel.event_id_column].to_numpy().tolist() - objs = pandas_utils.format_unique(ocel.relations[ocel.relations[ocel.event_id_column].isin(evs)][ocel.object_id_column].unique()) + objs = pandas_utils.format_unique( + ocel.relations[ocel.relations[ocel.event_id_column].isin(evs)][ocel.object_id_column].unique() + ) from pm4py.algo.transformation.ocel.graphs import object_interaction_graph diff --git a/pm4py/llm.py b/pm4py/llm.py index 9a9746bf2..2495cbc1f 100644 --- a/pm4py/llm.py +++ b/pm4py/llm.py @@ -35,17 +35,18 @@ def openai_query(prompt: str, api_key: Optional[str] = None, openai_model: Optio """ Executes the provided prompt, obtaining the answer from the OpenAI APIs. - :param prompt: prompt that should be executed - :param api_key: OpenAI API key - :param openai_model: OpenAI model to be used (default: gpt-3.5-turbo) - :param api_url: OpenAI API URL - :rtype: ``str`` + :param prompt: The prompt to be executed. + :param api_key: (Optional) OpenAI API key. + :param openai_model: (Optional) OpenAI model to be used (default: "gpt-3.5-turbo"). + :param api_url: (Optional) OpenAI API URL. 
+ :param **kwargs: Additional parameters to pass to the OpenAI API. + :return: The response from the OpenAI API as a string. .. code-block:: python3 import pm4py - resp = pm4py.llm.openai_query('what is the result of 3+3?', api_key="sk-382393", openai_model="gpt-3.5-turbo") + resp = pm4py.llm.openai_query('What is the result of 3+3?', api_key="sk-382393", openai_model="gpt-3.5-turbo") print(resp) """ parameters = copy(kwargs) if kwargs is not None else {} @@ -62,19 +63,19 @@ def openai_query(prompt: str, api_key: Optional[str] = None, openai_model: Optio def abstract_dfg(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_len: int = constants.OPENAI_MAX_LEN, include_performance: bool = True, relative_frequency: bool = False, response_header: bool = True, primary_performance_aggregation: str = "mean", secondary_performance_aggregation: Optional[str] = None, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> str: """ - Obtains the DFG abstraction of a traditional event log - - :param log_obj: log object - :param max_len: maximum length of the (string) abstraction - :param include_performance: (boolean) includes the performance of the paths in the abstraction - :param relative_frequency: (boolean) uses the relative instead of the absolute frequency of the paths - :param response_header: includes a short header before the paths, pointing to the description of the abstraction - :param primary_performance_aggregation: primary aggregation to be used for the arc's performance (default: mean, other options: median, min, max, sum, stdev) - :param secondary_performance_aggregation: (optional) secondary aggregation to be used for the arc's performance (default None, other options: mean, median, min, max, sum, stdev) - :param activity_key: the column to be used as activity - :param timestamp_key: the column to be used as timestamp - :param case_id_key: the column to be used as case identifier - :rtype: ``str`` + Obtains the DFG (Directly-Follows Graph) abstraction of a traditional event log. + + :param log_obj: The log object to abstract. + :param max_len: Maximum length of the string abstraction (default: constants.OPENAI_MAX_LEN). + :param include_performance: Whether to include the performance of the paths in the abstraction. + :param relative_frequency: Whether to use relative instead of absolute frequency of the paths. + :param response_header: Whether to include a short header before the paths, describing the abstraction. + :param primary_performance_aggregation: Primary aggregation method for the arc's performance (default: "mean"). Other options: "median", "min", "max", "sum", "stdev". + :param secondary_performance_aggregation: (Optional) Secondary aggregation method for the arc's performance (default: None). Other options: "mean", "median", "min", "max", "sum", "stdev". + :param activity_key: The column name to be used as activity. + :param timestamp_key: The column name to be used as timestamp. + :param case_id_key: The column name to be used as case identifier. + :return: The DFG abstraction as a string. .. 
code-block:: python3 @@ -100,19 +101,19 @@ def abstract_dfg(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_len: i def abstract_variants(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_len: int = constants.OPENAI_MAX_LEN, include_performance: bool = True, relative_frequency: bool = False, response_header: bool = True, primary_performance_aggregation: str = "mean", secondary_performance_aggregation: Optional[str] = None, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> str: """ - Obtains the variants abstraction of a traditional event log - - :param log_obj: log object - :param max_len: maximum length of the (string) abstraction - :param include_performance: (boolean) includes the performance of the variants in the abstraction - :param relative_frequency: (boolean) uses the relative instead of the absolute frequency of the variants - :param response_header: includes a short header before the variants, pointing to the description of the abstraction - :param primary_performance_aggregation: primary aggregation to be used for the arc's performance (default: mean, other options: median, min, max, sum, stdev) - :param secondary_performance_aggregation: (optional) secondary aggregation to be used for the arc's performance (default None, other options: mean, median, min, max, sum, stdev) - :param activity_key: the column to be used as activity - :param timestamp_key: the column to be used as timestamp - :param case_id_key: the column to be used as case identifier - :rtype: ``str`` + Obtains the variants abstraction of a traditional event log. + + :param log_obj: The log object to abstract. + :param max_len: Maximum length of the string abstraction (default: constants.OPENAI_MAX_LEN). + :param include_performance: Whether to include the performance of the variants in the abstraction. + :param relative_frequency: Whether to use relative instead of absolute frequency of the variants. + :param response_header: Whether to include a short header before the variants, describing the abstraction. + :param primary_performance_aggregation: Primary aggregation method for the variants' performance (default: "mean"). Other options: "median", "min", "max", "sum", "stdev". + :param secondary_performance_aggregation: (Optional) Secondary aggregation method for the variants' performance (default: None). Other options: "mean", "median", "min", "max", "sum", "stdev". + :param activity_key: The column name to be used as activity. + :param timestamp_key: The column name to be used as timestamp. + :param case_id_key: The column name to be used as case identifier. + :return: The variants abstraction as a string. .. code-block:: python3 @@ -138,11 +139,11 @@ def abstract_variants(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_l def abstract_ocel(ocel: OCEL, include_timestamps: bool = True) -> str: """ - Obtains the abstraction of an object-centric event log, including the list of events and the objects of the OCEL + Obtains the abstraction of an object-centric event log, including the list of events and the objects of the OCEL. - :param ocel: object-centric event log - :param include_timestamps: (boolean) includes the timestamp information in the abstraction - :rtype: ``str`` + :param ocel: The object-centric event log to abstract. + :param include_timestamps: Whether to include timestamp information in the abstraction. + :return: The OCEL abstraction as a string. .. 
code-block:: python3 @@ -160,14 +161,13 @@ def abstract_ocel(ocel: OCEL, include_timestamps: bool = True) -> str: def abstract_ocel_ocdfg(ocel: OCEL, include_header: bool = True, include_timestamps: bool = True, max_len: int = constants.OPENAI_MAX_LEN) -> str: """ - Obtains the abstraction of an object-centric event log, representing in text the object-centric directly-follows - graph + Obtains the abstraction of an object-centric event log, representing the object-centric directly-follows graph in text. - :param ocel: object-centric event log - :param include_header: (boolean) includes the header in the abstraction - :param include_timestamps: (boolean) includes the timestamp information in the abstraction - :param max_len: maximum length of the abstraction - :rtype: ``str`` + :param ocel: The object-centric event log to abstract. + :param include_header: Whether to include a header in the abstraction. + :param include_timestamps: Whether to include timestamp information in the abstraction. + :param max_len: Maximum length of the abstraction (default: constants.OPENAI_MAX_LEN). + :return: The object-centric DFG abstraction as a string. .. code-block:: python3 @@ -187,22 +187,22 @@ def abstract_ocel_ocdfg(ocel: OCEL, include_header: bool = True, include_timesta def abstract_ocel_features(ocel: OCEL, obj_type: str, include_header: bool = True, max_len: int = constants.OPENAI_MAX_LEN, debug: bool = False, enable_object_lifecycle_paths: bool = True) -> str: """ - Obtains the abstraction of an object-centric event log, representing in text the features and their values. + Obtains the abstraction of an object-centric event log, representing the features and their values in text. - :param ocel: object-centric event log - :param obj_type: the object type that should be considered in the feature extraction - :param include_header: (boolean) includes the header in the abstraction - :param max_len: maximum length of the abstraction - :param debug: enables debugging mode (telling at which point of the feature extraction you are) - :param enable_object_lifecycle_paths: enables the "lifecycle paths" feature - :rtype: ``str`` + :param ocel: The object-centric event log to abstract. + :param obj_type: The object type to consider in feature extraction. + :param include_header: Whether to include a header in the abstraction. + :param max_len: Maximum length of the abstraction (default: constants.OPENAI_MAX_LEN). + :param debug: Enables debugging mode, providing insights into feature extraction steps. + :param enable_object_lifecycle_paths: Enables the "lifecycle paths" feature in the abstraction. + :return: The OCEL features abstraction as a string. .. code-block:: python3 import pm4py ocel = pm4py.read_ocel("tests/input_data/ocel/example_log.jsonocel") - print(pm4py.llm.abstract_ocel_ocdfg(ocel)) + print(pm4py.llm.abstract_ocel_features(ocel, obj_type="Resource")) """ parameters = {} parameters["include_header"] = include_header @@ -216,15 +216,15 @@ def abstract_ocel_features(ocel: OCEL, obj_type: str, include_header: bool = Tru def abstract_event_stream(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_len: int = constants.OPENAI_MAX_LEN, response_header: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> str: """ - Obtains the event stream abstraction of a traditional event log + Obtains the event stream abstraction of a traditional event log. 
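The textual abstractions in this module are typically combined with the query helper defined earlier; a hedged sketch (the prompt wording and the API key are placeholders, and the example log path is the one used elsewhere in these docstrings):

.. code-block:: python3

    import pm4py

    log = pm4py.read_xes("tests/input_data/running-example.xes")
    abstraction = pm4py.llm.abstract_dfg(log)

    prompt = "Describe the main bottlenecks of this process:\n\n" + abstraction
    resp = pm4py.llm.openai_query(prompt, api_key="sk-...")  # placeholder API key
    print(resp)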
- :param log_obj: log object - :param max_len: maximum length of the (string) abstraction - :param response_header: includes a short header before the variants, pointing to the description of the abstraction - :param activity_key: the column to be used as activity - :param timestamp_key: the column to be used as timestamp - :param case_id_key: the column to be used as case identifier - :rtype: ``str`` + :param log_obj: The log object to abstract. + :param max_len: Maximum length of the string abstraction (default: constants.OPENAI_MAX_LEN). + :param response_header: Whether to include a short header before the event stream, describing the abstraction. + :param activity_key: The column name to be used as activity. + :param timestamp_key: The column name to be used as timestamp. + :param case_id_key: The column name to be used as case identifier. + :return: The event stream abstraction as a string. .. code-block:: python3 @@ -246,13 +246,13 @@ def abstract_event_stream(log_obj: Union[pd.DataFrame, EventLog, EventStream], m def abstract_petri_net(net: PetriNet, im: Marking, fm: Marking, response_header: bool = True) -> str: """ - Obtain an abstraction of a Petri net + Obtains an abstraction of a Petri net. - :param net: Petri net - :param im: Initial marking - :param fm: Final marking - :param response_header: includes the header of the response - :rtype: ``str`` + :param net: The Petri net to abstract. + :param im: The initial marking of the Petri net. + :param fm: The final marking of the Petri net. + :param response_header: Whether to include a header in the abstraction. + :return: The Petri net abstraction as a string. .. code-block:: python3 @@ -270,14 +270,14 @@ def abstract_petri_net(net: PetriNet, im: Marking, fm: Marking, response_header: def abstract_log_attributes(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_len: int = constants.OPENAI_MAX_LEN, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> str: """ - Abstracts the attributes of a log (reporting their name, their type, and the top values) + Abstracts the attributes of a log by reporting their names, types, and top values. - :param log_obj: log object - :param max_len: maximum length of the (string) abstraction - :param activity_key: the column to be used as activity - :param timestamp_key: the column to be used as timestamp - :param case_id_key: the column to be used as case identifier - :rtype: ``str`` + :param log_obj: The log object whose attributes are to be abstracted. + :param max_len: Maximum length of the string abstraction (default: constants.OPENAI_MAX_LEN). + :param activity_key: The column name to be used as activity. + :param timestamp_key: The column name to be used as timestamp. + :param case_id_key: The column name to be used as case identifier. + :return: The log attributes abstraction as a string. .. 
code-block:: python3 @@ -298,14 +298,15 @@ def abstract_log_attributes(log_obj: Union[pd.DataFrame, EventLog, EventStream], def abstract_log_features(log_obj: Union[pd.DataFrame, EventLog, EventStream], max_len: int = constants.OPENAI_MAX_LEN, include_header: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> str: """ - Abstracts the machine learning features obtained from a log (reporting the top features until the desired length is obtained) + Abstracts the machine learning features obtained from a log by reporting the top features until the desired length is achieved. - :param log_obj: log object - :param max_len: maximum length of the (string) abstraction - :param activity_key: the column to be used as activity - :param timestamp_key: the column to be used as timestamp - :param case_id_key: the column to be used as case identifier - :rtype: ``str`` + :param log_obj: The log object from which to extract features. + :param max_len: Maximum length of the string abstraction (default: constants.OPENAI_MAX_LEN). + :param include_header: Whether to include a header in the abstraction. + :param activity_key: The column name to be used as activity. + :param timestamp_key: The column name to be used as timestamp. + :param case_id_key: The column name to be used as case identifier. + :return: The log features abstraction as a string. .. code-block:: python3 @@ -327,11 +328,11 @@ def abstract_log_features(log_obj: Union[pd.DataFrame, EventLog, EventStream], m def abstract_temporal_profile(temporal_profile: Dict[Tuple[str, str], Tuple[float, float]], include_header: bool = True) -> str: """ - Abstracts a temporal profile model to a string. + Abstracts a temporal profile model into a descriptive string. - :param temporal_profile: temporal profile model - :param include_header: includes an header in the response, describing the temporal profile - :rtype: ``str`` + :param temporal_profile: The temporal profile model to abstract. + :param include_header: Whether to include a header in the abstraction describing the temporal profile. + :return: The temporal profile abstraction as a string. .. code-block:: python3 @@ -351,16 +352,16 @@ def abstract_temporal_profile(temporal_profile: Dict[Tuple[str, str], Tuple[floa def abstract_case(case: Trace, include_case_attributes: bool = True, include_event_attributes: bool = True, include_timestamp: bool = True, include_header: bool = True, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp") -> str: """ - Textually abstracts a case + Textually abstracts a single case from an event log. - :param case: case object - :param include_case_attributes: (boolean) include or not the attributes at the case level - :param include_event_attributes: (boolean) include or not the attributes at the event level - :param include_timestamp: (boolean) include or not the event timestamp in the abstraction - :param include_header: (boolean) includes the header of the response - :param activity_key: the column to be used as activity - :param timestamp_key: the column to be used as timestamp - :rtype: ``str`` + :param case: The case object to abstract. + :param include_case_attributes: Whether to include attributes at the case level. + :param include_event_attributes: Whether to include attributes at the event level. + :param include_timestamp: Whether to include event timestamps in the abstraction. + :param include_header: Whether to include a header in the abstraction. 
+ :param activity_key: The column name to be used as activity. + :param timestamp_key: The column name to be used as timestamp. + :return: The case abstraction as a string. .. code-block:: python3 @@ -383,11 +384,11 @@ def abstract_case(case: Trace, include_case_attributes: bool = True, include_eve def abstract_declare(declare_model, include_header: bool = True) -> str: """ - Textually abstracts a DECLARE model + Textually abstracts a DECLARE model. - :param declare: DECLARE model - :param include_header: (boolean) includes the header of the response - :rtype: ``str`` + :param declare_model: The DECLARE model to abstract. + :param include_header: Whether to include a header in the abstraction. + :return: The DECLARE model abstraction as a string. .. code-block:: python3 @@ -406,11 +407,11 @@ def abstract_declare(declare_model, include_header: bool = True) -> str: def abstract_log_skeleton(log_skeleton, include_header: bool = True) -> str: """ - Textually abstracts a log skeleton process model + Textually abstracts a log skeleton process model. - :param log_skeleton: log skeleton - :param include_header: (boolean) includes the header of the response - :rtype: ``str`` + :param log_skeleton: The log skeleton to abstract. + :param include_header: Whether to include a header in the abstraction. + :return: The log skeleton abstraction as a string. .. code-block:: python3 @@ -429,14 +430,13 @@ def abstract_log_skeleton(log_skeleton, include_header: bool = True) -> str: def explain_visualization(vis_saver, *args, connector=openai_query, **kwargs) -> str: """ - Explains a process mining visualization by using LLMs (saving that first in a .png image, then providing the .png file to the - Large Language Model along with possibly a description of the visualization). - - :param vis_saver: the visualizer (saving to disk) to be used - :param args: the mandatory arguments that should be provided to the visualization - :param connector: the connector method to the large language model - :param kwargs: optional parameters of the visualization or the connector (for example, the annotation of the visualization, or the API key) - :rtype: ``str`` + Explains a process mining visualization using LLMs by saving it as a .png image and providing the image to the Large Language Model along with a description. + + :param vis_saver: The visualizer function used to save the visualization to disk. + :param args: Positional arguments required by the visualizer function. + :param connector: (Optional) The connector method to communicate with the large language model (default: openai_query). + :param **kwargs: Additional keyword arguments for the visualizer function or the connector (e.g., annotations, API key). + :return: The explanation of the visualization as a string. .. code-block:: python3 diff --git a/pm4py/ml.py b/pm4py/ml.py index e35b75c93..f4d2ef072 100644 --- a/pm4py/ml.py +++ b/pm4py/ml.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.ml`` module contains the machine learning features offered in ``pm4py`` +The ``pm4py.ml`` module contains the machine learning features offered in ``pm4py``. 
""" from typing import Union, Tuple, Any, List, Collection, Optional @@ -34,15 +34,24 @@ from pm4py.utils import get_properties, constants, pandas_utils -def split_train_test(log: Union[EventLog, pd.DataFrame], train_percentage: float = 0.8, case_id_key="case:concept:name") -> Union[ - Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]: +def split_train_test( + log: Union[EventLog, pd.DataFrame], + train_percentage: float = 0.8, + case_id_key: str = "case:concept:name" +) -> Union[ + Tuple[EventLog, EventLog], + Tuple[pd.DataFrame, pd.DataFrame] +]: """ - Split an event log in a training log and a test log (for machine learning purposes). - Returns the training and the test event log. + Splits an event log into a training log and a test log for machine learning purposes. - :param log: event log / Pandas dataframe - :param train_percentage: fraction of traces to be included in the training log (from 0.0 to 1.0) - :param case_id_key: attribute to be used as case identifier + This function separates the provided log into two parts based on the specified training percentage. + It ensures that entire cases are included in either the training set or the test set. + + :param log: The event log or Pandas DataFrame to be split. + :param train_percentage: Fraction of cases to be included in the training log (between 0.0 and 1.0). + :param case_id_key: Attribute to be used as the case identifier. + :return: A tuple containing the training and test event logs or DataFrames. :rtype: ``Union[Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]`` .. code-block:: python3 @@ -72,15 +81,22 @@ def split_train_test(log: Union[EventLog, pd.DataFrame], train_percentage: float return split_train_test.split(log, train_percentage=train_percentage) -def get_prefixes_from_log(log: Union[EventLog, pd.DataFrame], length: int, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: +def get_prefixes_from_log( + log: Union[EventLog, pd.DataFrame], + length: int, + case_id_key: str = "case:concept:name" +) -> Union[EventLog, pd.DataFrame]: """ - Gets the prefixes of a log of a given length. The returned log object contain the prefixes: - - if a trace has lower or identical length, it is included as-is - - if a trace has greater length, it is cut + Retrieves prefixes of traces in a log up to a specified length. + + The returned log contains prefixes of each trace: + - If a trace has a length less than or equal to the specified length, it is included as-is. + - If a trace exceeds the specified length, it is truncated to that length. - :param log: event log / Pandas dataframe - :param length: length - :param case_id_key: attribute to be used as case identifier + :param log: The event log or Pandas DataFrame from which to extract prefixes. + :param length: The maximum length of prefixes to extract. + :param case_id_key: Attribute to be used as the case identifier. + :return: A log containing the prefixes of the original log. :rtype: ``Union[EventLog, pd.DataFrame]`` .. 
code-block:: python3 @@ -95,114 +111,198 @@ def get_prefixes_from_log(log: Union[EventLog, pd.DataFrame], length: int, case_ check_pandas_dataframe_columns(log, case_id_key=case_id_key) from pm4py.util import pandas_utils log = pandas_utils.insert_ev_in_tr_index(log, case_id=case_id_key) - return log[log[constants.DEFAULT_INDEX_IN_TRACE_KEY] <= (length-1)] + return log[log[constants.DEFAULT_INDEX_IN_TRACE_KEY] <= (length - 1)] else: from pm4py.objects.log.util import get_prefixes return get_prefixes.get_prefixes_from_log(log, length) -def extract_outcome_enriched_dataframe(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", - timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", - start_timestamp_key: str = "time:timestamp") -> pd.DataFrame: +def extract_outcome_enriched_dataframe( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + start_timestamp_key: str = "time:timestamp" +) -> pd.DataFrame: """ - Inserts additional columns in the dataframe which are computed on the overall case, so they model the - outcome of the case. - - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param start_timestamp_key: attribute to be used as start timestamp + Enriches a dataframe with additional outcome-related columns computed from the entire case. + + This function adds columns that model the outcome of each case by computing metrics such as + arrival rates and service waiting times. + + :param log: The event log or Pandas DataFrame to be enriched. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :param start_timestamp_key: Attribute to be used as the start timestamp. + :return: An enriched Pandas DataFrame with additional outcome-related columns. :rtype: ``pd.DataFrame`` .. 
code-block:: python3 import pm4py - enriched_df = pm4py.extract_outcome_enriched_dataframe(log, activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name', start_timestamp_key='time:timestamp') - + enriched_df = pm4py.extract_outcome_enriched_dataframe( + log, + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name', + start_timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) from pm4py.objects.conversion.log import converter as log_converter - log = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME, parameters=properties) + log = log_converter.apply( + log, + variant=log_converter.Variants.TO_DATA_FRAME, + parameters=properties + ) from pm4py.util import pandas_utils - fea_df = extract_features_dataframe(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, include_case_id=True) - log2 = pandas_utils.insert_case_arrival_finish_rate(log.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key) - log2 = pandas_utils.insert_case_service_waiting_time(log2.copy(), timestamp_column=timestamp_key, case_id_column=case_id_key, start_timestamp_column=start_timestamp_key) - - return log2.merge(fea_df, left_on=case_id_key, right_on=case_id_key) - - -def extract_features_dataframe(log: Union[EventLog, pd.DataFrame], str_tr_attr=None, num_tr_attr=None, str_ev_attr=None, num_ev_attr=None, str_evsucc_attr=None, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, resource_key="org:resource", include_case_id: bool = False, **kwargs) -> pd.DataFrame: + fea_df = extract_features_dataframe( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + include_case_id=True + ) + log2 = pandas_utils.insert_case_arrival_finish_rate( + log.copy(), + timestamp_column=timestamp_key, + case_id_column=case_id_key, + start_timestamp_column=start_timestamp_key + ) + log2 = pandas_utils.insert_case_service_waiting_time( + log2.copy(), + timestamp_column=timestamp_key, + case_id_column=case_id_key, + start_timestamp_column=start_timestamp_key + ) + + return log2.merge(fea_df, on=case_id_key) + + +def extract_features_dataframe( + log: Union[EventLog, pd.DataFrame], + str_tr_attr: Optional[List[str]] = None, + num_tr_attr: Optional[List[str]] = None, + str_ev_attr: Optional[List[str]] = None, + num_ev_attr: Optional[List[str]] = None, + str_evsucc_attr: Optional[List[str]] = None, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: Optional[str] = None, + resource_key: str = "org:resource", + include_case_id: bool = False, + **kwargs +) -> pd.DataFrame: """ - Extracts a dataframe containing the features of each case of the provided log object - - :param log: log object (event log / Pandas dataframe) - :param str_tr_attr: (if provided) string attributes at the case level which should be extracted as features - :param num_tr_attr: (if provided) numeric attributes at the case level which should be extracted as features - :param str_ev_attr: (if provided) string attributes at the event level which should be extracted as features (one-hot encoding) - :param num_ev_attr: (if provided) 
numeric attributes at the event level which should be extracted as features (last value per attribute in a case) - :param activity_key: the attribute to be used as activity - :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier - :param resource_key: the attribute to be used as resource - :param include_case_id: includes the case identifier column in the features table + Extracts a dataframe containing features for each case in the provided log object. + + This function processes the log to generate a set of features that can be used for machine learning tasks. + Features can include both case-level and event-level attributes, with options for one-hot encoding. + + :param log: The event log or Pandas DataFrame from which to extract features. + :param str_tr_attr: (Optional) List of string attributes at the case level to extract as features. + :param num_tr_attr: (Optional) List of numeric attributes at the case level to extract as features. + :param str_ev_attr: (Optional) List of string attributes at the event level to extract as features (one-hot encoded). + :param num_ev_attr: (Optional) List of numeric attributes at the event level to extract as features + (uses the last value per attribute in a case). + :param str_evsucc_attr: (Optional) List of string successor attributes at the event level to extract as features. + :param activity_key: Attribute to be used as the activity identifier. + :param timestamp_key: Attribute to be used for timestamps. + :param case_id_key: (Optional) Attribute to be used as the case identifier. If not provided, the default is used. + :param resource_key: Attribute to be used as the resource identifier. + :param include_case_id: Whether to include the case identifier column in the features table. + :param **kwargs: Additional keyword arguments to pass to the feature extraction algorithm. + :return: A Pandas DataFrame containing the extracted features for each case. :rtype: ``pd.DataFrame`` .. 
code-block:: python3 import pm4py - features_df = pm4py.extract_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + features_df = pm4py.extract_features_dataframe( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - parameters = {} - if kwargs is not None: - parameters = kwargs + parameters = kwargs if kwargs else {} - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) for prop in properties: parameters[prop] = properties[prop] - parameters["str_tr_attr"] = str_tr_attr - parameters["num_tr_attr"] = num_tr_attr - parameters["str_ev_attr"] = str_ev_attr - parameters["num_ev_attr"] = num_ev_attr - parameters["str_evsucc_attr"] = str_evsucc_attr + parameters["str_tr_attr"] = str_tr_attr or [] + parameters["num_tr_attr"] = num_tr_attr or [] + parameters["str_ev_attr"] = str_ev_attr or [] + parameters["num_ev_attr"] = num_ev_attr or [] + parameters["str_evsucc_attr"] = str_evsucc_attr or [] parameters["add_case_identifier_column"] = include_case_id from pm4py.algo.transformation.log_to_features import algorithm as log_to_features if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, case_id_key=case_id_key, timestamp_key=timestamp_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + case_id_key=case_id_key, + timestamp_key=timestamp_key + ) data, feature_names = log_to_features.apply(log, parameters=parameters) return pandas_utils.instantiate_dataframe(data, columns=feature_names) -def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_paths: bool = True, enable_object_work_in_progress: bool = False, object_str_attributes: Optional[Collection[str]] = None, object_num_attributes: Optional[Collection[str]] = None, include_obj_id: bool = False, debug: bool = False) -> pd.DataFrame: +def extract_ocel_features( + ocel: OCEL, + obj_type: str, + enable_object_lifecycle_paths: bool = True, + enable_object_work_in_progress: bool = False, + object_str_attributes: Optional[Collection[str]] = None, + object_num_attributes: Optional[Collection[str]] = None, + include_obj_id: bool = False, + debug: bool = False +) -> pd.DataFrame: """ - Extracts from an object-centric event log a set of features (returned as dataframe) computed on the OCEL - for the objects of a given object type. - - Implements the approach described in: - Berti, A., Herforth, J., Qafari, M.S. et al. Graph-based feature extraction on object-centric event logs. Int J Data Sci Anal (2023). 
https://doi.org/10.1007/s41060-023-00428-2 - - :param ocel: object-centric event log - :param obj_type: object type that should be considered - :param enable_object_lifecycle_paths: enables the "lifecycle paths" feature - :param enable_object_work_in_progress: enables the "work in progress" feature (which has an high computational cost) - :param object_str_attributes: string attributes at the object level to one-hot encode during the feature extraction - :param object_num_attributes: numeric attributes at the object level to one-hot encode during the feature extraction - :param include_obj_id: includes the object identifier as column of the "features" dataframe - :param debug: enables debugging mode (telling at which point of the feature extraction you are) + Extracts a set of features from an object-centric event log (OCEL) for objects of a specified type. + + This function computes various features based on the lifecycle paths and work-in-progress metrics + of objects within the OCEL. It also supports encoding of string and numeric object attributes. + + The approach is based on: + Berti, A., Herforth, J., Qafari, M.S. et al. Graph-based feature extraction on object-centric event logs. + Int J Data Sci Anal (2023). https://doi.org/10.1007/s41060-023-00428-2 + + :param ocel: The object-centric event log from which to extract features. + :param obj_type: The object type to consider for feature extraction. + :param enable_object_lifecycle_paths: Whether to enable the "lifecycle paths" feature. + :param enable_object_work_in_progress: Whether to enable the "work in progress" feature, + which has a high computational cost. + :param object_str_attributes: (Optional) Collection of string attributes at the object level to one-hot encode. + :param object_num_attributes: (Optional) Collection of numeric attributes at the object level to encode. + :param include_obj_id: Whether to include the object identifier as a column in the features DataFrame. + :param debug: Whether to enable debugging mode to track the feature extraction process. + :return: A Pandas DataFrame containing the extracted features for the specified object type. :rtype: ``pd.DataFrame`` .. 
code-block:: python3 @@ -218,15 +318,16 @@ def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_pat if object_num_attributes is None: object_num_attributes = [] - parameters = {} - parameters["filter_per_type"] = obj_type - parameters["enable_object_lifecycle_paths"] = enable_object_lifecycle_paths - parameters["enable_object_work_in_progress"] = enable_object_work_in_progress - parameters["enable_object_str_attributes"] = len(object_str_attributes) > 0 - parameters["enable_object_num_attributes"] = len(object_num_attributes) > 0 - parameters["str_obj_attr"] = object_str_attributes - parameters["num_obj_attr"] = object_num_attributes - parameters["debug"] = debug + parameters = { + "filter_per_type": obj_type, + "enable_object_lifecycle_paths": enable_object_lifecycle_paths, + "enable_object_work_in_progress": enable_object_work_in_progress, + "enable_object_str_attributes": bool(object_str_attributes), + "enable_object_num_attributes": bool(object_num_attributes), + "str_obj_attr": object_str_attributes, + "num_obj_attr": object_num_attributes, + "debug": debug + } from pm4py.algo.transformation.ocel.features.objects import algorithm as ocel_feature_extraction @@ -238,37 +339,66 @@ def extract_ocel_features(ocel: OCEL, obj_type: str, enable_object_lifecycle_pat if include_obj_id: objects_with_type = ocel.objects[[ocel.object_id_column, ocel.object_type_column]].to_dict("records") - objects_with_type = [x[ocel.object_id_column] for x in objects_with_type if x[ocel.object_type_column] == obj_type] + objects_with_type = [ + x[ocel.object_id_column] for x in objects_with_type + if x[ocel.object_type_column] == obj_type + ] dataframe[ocel.object_id_column] = objects_with_type return dataframe -def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grouper_freq="W", activity_key="concept:name", timestamp_key="time:timestamp", case_id_key=None, start_timestamp_key="time:timestamp", resource_key="org:resource") -> pd.DataFrame: +def extract_temporal_features_dataframe( + log: Union[EventLog, pd.DataFrame], + grouper_freq: str = "W", + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: Optional[str] = None, + start_timestamp_key: str = "time:timestamp", + resource_key: str = "org:resource" +) -> pd.DataFrame: """ - Extracts a dataframe containing the temporal features of the provided log object - - Implements the approach described in the paper: - Pourbafrani, Mahsa, Sebastiaan J. van Zelst, and Wil MP van der Aalst. "Supporting automatic system dynamics model generation for simulation in the context of process mining." International Conference on Business Information Systems. Springer, Cham, 2020. - - :param log: log object (event log / Pandas dataframe) - :param grouper_freq: the grouping frequency (D, W, M, Y) to use - :param activity_key: the attribute to be used as activity - :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: (if provided, otherwise default) the attribute to be used as case identifier - :param resource_key: the attribute to be used as resource - :param start_timestamp_key: the attribute to be used as start timestamp + Extracts temporal features from a log object and returns them as a dataframe. + + This function computes temporal metrics based on the specified grouping frequency, which can be + daily (D), weekly (W), monthly (M), or yearly (Y). These features are useful for analyzing + system dynamics and simulation in the context of process mining. 
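To make the grouping frequency concrete, a minimal sketch (the dataframe and column names mirror the example in the docstring; "M" requests monthly grouping instead of the default weekly grouping, as described above):

.. code-block:: python3

    import pm4py

    # monthly temporal features instead of the default weekly grouping
    monthly_features_df = pm4py.extract_temporal_features_dataframe(
        dataframe,
        grouper_freq="M",
        activity_key='concept:name',
        timestamp_key='time:timestamp',
        case_id_key='case:concept:name'
    )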
+ + The approach is based on: + Pourbafrani, Mahsa, Sebastiaan J. van Zelst, and Wil MP van der Aalst. + "Supporting automatic system dynamics model generation for simulation in the context of process mining." + International Conference on Business Information Systems. Springer, Cham, 2020. + + :param log: The event log or Pandas DataFrame from which to extract temporal features. + :param grouper_freq: The frequency to use for grouping (e.g., 'D' for daily, 'W' for weekly, + 'M' for monthly, 'Y' for yearly). + :param activity_key: Attribute to be used as the activity identifier. + :param timestamp_key: Attribute to be used for timestamps. + :param case_id_key: (Optional) Attribute to be used as the case identifier. If not provided, the default is used. + :param start_timestamp_key: Attribute to be used as the start timestamp. + :param resource_key: Attribute to be used as the resource identifier. + :return: A Pandas DataFrame containing the extracted temporal features. :rtype: ``pd.DataFrame`` .. code-block:: python3 import pm4py - temporal_features_df = pm4py.extract_temporal_features_dataframe(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + temporal_features_df = pm4py.extract_temporal_features_dataframe( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + parameters = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.transformation.log_to_features.variants import temporal @@ -283,39 +413,79 @@ def extract_temporal_features_dataframe(log: Union[EventLog, pd.DataFrame], grou return temporal.apply(log, parameters=parameters) -def extract_target_vector(log: Union[EventLog, pd.DataFrame], variant: str, activity_key="concept:name", timestamp_key="time:timestamp", case_id_key="case:concept:name") -> Tuple[Any, List[str]]: +def extract_target_vector( + log: Union[EventLog, pd.DataFrame], + variant: str, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name" +) -> Tuple[Any, List[str]]: """ - Extracts from a log object the target vector for a specific ML use case - (next activity, next time, remaining time) - - :param log: log object (event log / Pandas dataframe) - :param variant: variant of the algorithm to be used: next_activity, next_time, remaining_time - :param activity_key: the attribute to be used as activity - :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: the attribute to be used as case identifier + Extracts the target vector from a log object for a specific machine learning use case. + + Supported variants include: + - 'next_activity': Predicts the next activity in a case. + - 'next_time': Predicts the timestamp of the next activity. + - 'remaining_time': Predicts the remaining time for the case. + + :param log: The event log or Pandas DataFrame from which to extract the target vector. + :param variant: The variant of the algorithm to use. Must be one of: + 'next_activity', 'next_time', 'remaining_time'. + :param activity_key: Attribute to be used as the activity identifier. + :param timestamp_key: Attribute to be used for timestamps. + :param case_id_key: Attribute to be used as the case identifier. 
+ :return: A tuple containing the target vector and a list of class labels (if applicable). :rtype: ``Tuple[Any, List[str]]`` + :raises Exception: If an unsupported variant is provided. + .. code-block:: python3 import pm4py - vector_next_act, class_next_act = pm4py.extract_target_vector(log, 'next_activity', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') - vector_next_time, class_next_time = pm4py.extract_target_vector(log, 'next_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') - vector_rem_time, class_rem_time = pm4py.extract_target_vector(log, 'remaining_time', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') - + vector_next_act, class_next_act = pm4py.extract_target_vector( + log, + 'next_activity', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) + vector_next_time, class_next_time = pm4py.extract_target_vector( + log, + 'next_time', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) + vector_rem_time, class_rem_time = pm4py.extract_target_vector( + log, + 'remaining_time', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) - parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + parameters = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key + ) from pm4py.algo.transformation.log_to_target import algorithm as log_to_target - var_map = {"next_activity": log_to_target.Variants.NEXT_ACTIVITY, "next_time": log_to_target.Variants.NEXT_TIME, - "remaining_time": log_to_target.Variants.REMAINING_TIME} + var_map = { + "next_activity": log_to_target.Variants.NEXT_ACTIVITY, + "next_time": log_to_target.Variants.NEXT_TIME, + "remaining_time": log_to_target.Variants.REMAINING_TIME + } if variant not in var_map: raise Exception( - "please provide the variant between: next_activity, next_time, remaining_time") + "Please provide the variant as one of the following: 'next_activity', 'next_time', 'remaining_time'." + ) target, classes = log_to_target.apply(log, variant=var_map[variant], parameters=parameters) return target, classes diff --git a/pm4py/ocel.py b/pm4py/ocel.py index 102fe6b36..96f14d330 100644 --- a/pm4py/ocel.py +++ b/pm4py/ocel.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.ocel`` module contains the object-centric process mining features offered in ``pm4py`` +The ``pm4py.ocel`` module contains the object-centric process mining features offered in ``pm4py``. """ from typing import List, Dict, Collection, Any, Optional, Set, Tuple @@ -35,11 +35,13 @@ def ocel_get_object_types(ocel: OCEL) -> List[str]: """ - Gets the list of object types contained in the object-centric event log + Returns the list of object types contained in the object-centric event log (e.g., ["order", "item", "delivery"]). - :param ocel: object-centric event log - :rtype: ``List[str]`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: List of object types. + :rtype: List[str] .. 
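The three variants accepted by ``extract_target_vector`` can also be computed in a single loop; a minimal sketch, assuming a log with the default XES attribute names:

.. code-block:: python3

    import pm4py

    # placeholder file with the default XES attribute names
    log = pm4py.read_xes("tests/input_data/receipt.xes")

    targets = {}
    for variant in ("next_activity", "next_time", "remaining_time"):
        # each call returns the target vector and, where applicable, the class labels
        target, classes = pm4py.extract_target_vector(
            log,
            variant,
            activity_key="concept:name",
            timestamp_key="time:timestamp",
            case_id_key="case:concept:name",
        )
        targets[variant] = (target, classes)

..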
code-block:: python3 @@ -52,11 +54,13 @@ def ocel_get_object_types(ocel: OCEL) -> List[str]: def ocel_get_attribute_names(ocel: OCEL) -> List[str]: """ - Gets the list of attributes at the event and the object level of an object-centric event log - (e.g. ["cost", "amount", "name"]) + Returns the list of attributes at the event and object levels of an object-centric event log + (e.g., ["cost", "amount", "name"]). - :param ocel: object-centric event log - :rtype: ``List[str]`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: List of attribute names. + :rtype: List[str] .. code-block:: python3 @@ -70,17 +74,20 @@ def ocel_get_attribute_names(ocel: OCEL) -> List[str]: def ocel_flattening(ocel: OCEL, object_type: str) -> pd.DataFrame: """ - Flattens the object-centric event log to a traditional event log with the choice of an object type. - In the flattened log, the objects of a given object type are the cases, and each case - contains the set of events related to the object. - The flattened log follows the XES notations for case identifier, activity, and timestamp. In particular: - - "case:concept:name" is the column used for the case ID. - - "concept:name" is the column used for the activity. - - "time:timestamp" is the column used for the timestamp. - - :param ocel: object-centric event log - :param object_type: object type - :rtype: ``pd.DataFrame`` + Flattens the object-centric event log to a traditional event log based on a chosen object type. + In the flattened log, the objects of the specified type are treated as cases, and each case + contains the set of events related to that object. + The flattened log follows the XES notations for case identifier, activity, and timestamp. Specifically: + - "case:concept:name" is used for the case ID. + - "concept:name" is used for the activity. + - "time:timestamp" is used for the timestamp. + + :param ocel: Object-centric event log. + :type ocel: OCEL + :param object_type: The object type to use as cases. + :type object_type: str + :return: Flattened traditional event log. + :rtype: pd.DataFrame .. code-block:: python3 @@ -94,10 +101,12 @@ def ocel_flattening(ocel: OCEL, object_type: str) -> pd.DataFrame: def ocel_object_type_activities(ocel: OCEL) -> Dict[str, Collection[str]]: """ - Gets the set of activities performed for each object type + Returns the set of activities performed for each object type. - :param ocel: object-centric event log - :rtype: ``Dict[str, Collection[str]]`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: Dictionary mapping object types to their associated activities. + :rtype: Dict[str, Collection[str]] .. code-block:: python3 @@ -112,10 +121,12 @@ def ocel_object_type_activities(ocel: OCEL) -> Dict[str, Collection[str]]: def ocel_objects_ot_count(ocel: OCEL) -> Dict[str, Dict[str, int]]: """ - Counts for each event the number of related objects per type + Returns the count of related objects per type for each event. - :param ocel: object-centric event log - :rtype: ``Dict[str, Dict[str, int]]`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: Nested dictionary mapping events to object types and their counts. + :rtype: Dict[str, Dict[str, int]] .. code-block:: python3 @@ -130,12 +141,14 @@ def ocel_objects_ot_count(ocel: OCEL) -> Dict[str, Dict[str, int]]: def ocel_temporal_summary(ocel: OCEL) -> pd.DataFrame: """ - Returns the ``temporal summary'' from an object-centric event log. 
- The temporal summary aggregates all the events performed in the same timestamp, - and reports the list of activities and the involved objects. + Returns the temporal summary of an object-centric event log. + The temporal summary aggregates all events that occur at the same timestamp + and reports the list of activities and involved objects. - :param ocel: object-centric event log - :rtype: ``pd.DataFrame`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: Temporal summary DataFrame. + :rtype: pd.DataFrame .. code-block:: python3 @@ -152,10 +165,12 @@ def ocel_temporal_summary(ocel: OCEL) -> pd.DataFrame: def ocel_objects_summary(ocel: OCEL) -> pd.DataFrame: """ - Gets the objects summary of an object-centric event log + Returns the objects summary of an object-centric event log. - :param ocel: object-centric event log - :rtype: ``pd.DataFrame`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: Objects summary DataFrame containing lifecycle information and interacting objects. + :rtype: pd.DataFrame .. code-block:: python3 @@ -164,13 +179,30 @@ def ocel_objects_summary(ocel: OCEL) -> pd.DataFrame: objects_summary = pm4py.ocel_objects_summary(ocel) """ gdf = ocel.relations.groupby(ocel.object_id_column) - act_comb = gdf[ocel.event_activity].agg(list).to_frame().rename(columns={ocel.event_activity: "activities_lifecycle"}) - lif_start_tim = gdf[ocel.event_timestamp].min().to_frame().rename(columns={ocel.event_timestamp: "lifecycle_start"}) - lif_end_tim = gdf[ocel.event_timestamp].max().to_frame().rename(columns={ocel.event_timestamp: "lifecycle_end"}) + act_comb = ( + gdf[ocel.event_activity] + .agg(list) + .to_frame() + .rename(columns={ocel.event_activity: "activities_lifecycle"}) + ) + lif_start_tim = ( + gdf[ocel.event_timestamp] + .min() + .to_frame() + .rename(columns={ocel.event_timestamp: "lifecycle_start"}) + ) + lif_end_tim = ( + gdf[ocel.event_timestamp] + .max() + .to_frame() + .rename(columns={ocel.event_timestamp: "lifecycle_end"}) + ) objects_summary = act_comb.join(lif_start_tim) objects_summary = objects_summary.join(lif_end_tim) objects_summary = objects_summary.reset_index() - objects_summary["lifecycle_duration"] = pandas_utils.get_total_seconds(objects_summary["lifecycle_end"] - objects_summary["lifecycle_start"]) + objects_summary["lifecycle_duration"] = pandas_utils.get_total_seconds( + objects_summary["lifecycle_end"] - objects_summary["lifecycle_start"] + ) ev_rel_obj = ocel.relations.groupby(ocel.event_id_column)[ocel.object_id_column].agg(list).to_dict() objects_ids = pandas_utils.format_unique(ocel.objects[ocel.object_id_column].unique()) graph = {o: set() for o in objects_ids} @@ -186,12 +218,14 @@ def ocel_objects_summary(ocel: OCEL) -> pd.DataFrame: def ocel_objects_interactions_summary(ocel: OCEL) -> pd.DataFrame: """ - Gets the objects interactions summary of an object-centric event log. - The objects interactions summary has a row for every combination (event, related object, other related object). - Properties such as the activity of the event, and the object types of the two related objects, are included. + Returns the objects interactions summary of an object-centric event log. + The summary includes a row for every combination of (event, related object, other related object). + Properties such as the activity of the event and the object types of the two related objects are included. - :param ocel: object-centric event log - :rtype: ``OCEL`` + :param ocel: Object-centric event log. 
+ :type ocel: OCEL + :return: Objects interactions summary DataFrame. + :rtype: pd.DataFrame .. code-block:: python3 @@ -207,28 +241,45 @@ def ocel_objects_interactions_summary(ocel: OCEL) -> pd.DataFrame: rel_obj = ev_rel_obj[ev] for o1 in rel_obj: for o2 in rel_obj: - if o1 != o2: - stream.append({ocel.event_id_column: ev, ocel.event_activity: eve_activities[ev], - ocel.object_id_column: o1, ocel.object_type_column: obj_types[o1], - ocel.object_id_column+"_2": o2, ocel.object_type_column+"_2": obj_types[o2]}) + if o1 != o2: + stream.append( + { + ocel.event_id_column: ev, + ocel.event_activity: eve_activities[ev], + ocel.object_id_column: o1, + ocel.object_type_column: obj_types[o1], + f"{ocel.object_id_column}_2": o2, + f"{ocel.object_type_column}_2": obj_types[o2], + } + ) return pandas_utils.instantiate_dataframe(stream) -def discover_ocdfg(ocel: OCEL, business_hours=False, business_hour_slots=constants.DEFAULT_BUSINESS_HOUR_SLOTS) -> Dict[str, Any]: +def discover_ocdfg(ocel: OCEL, business_hours: bool = False, business_hour_slots: Optional[List[Tuple[int, int]]] = constants.DEFAULT_BUSINESS_HOUR_SLOTS) -> Dict[str, Any]: """ - Discovers an OC-DFG from an object-centric event log. + Discovers an Object-Centric Directly-Follows Graph (OC-DFG) from an object-centric event log. - Object-centric directly-follows multigraphs are a composition of directly-follows graphs for the single object type, which can be annotated with different metrics considering the entities of an object-centric event log (i.e., events, unique objects, total objects). + Object-centric directly-follows multigraphs are a composition of directly-follows graphs for each object type. + These graphs can be annotated with different metrics considering the entities of an object-centric event log + (i.e., events, unique objects, total objects). Reference paper: - Berti, Alessandro, and Wil van der Aalst. "Extracting multiple viewpoint models from relational databases." Data-Driven Process Discovery and Analysis. Springer, Cham, 2018. 24-51. - - :param ocel: object-centric event log - :param business_hours: boolean value that enables the usage of the business hours - :param business_hour_slots: work schedule of the company, provided as a list of tuples where each tuple represents one time slot of business hours. One slot i.e. one tuple consists of one start and one end time given in seconds since week start, e.g. [(7 * 60 * 60, 17 * 60 * 60), ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60),] meaning that business hours are Mondays 07:00 - 17:00 and Tuesdays 07:00 - 12:00 and 13:00 - 17:00 - - :rtype: ``Dict[str, Any]`` + Berti, Alessandro, and Wil van der Aalst. "Extracting multiple viewpoint models from relational databases." + Data-Driven Process Discovery and Analysis. Springer, Cham, 2018. 24-51. + + :param ocel: Object-centric event log. + :type ocel: OCEL + :param business_hours: Enable the usage of business hours if set to True. + :type business_hours: bool + :param business_hour_slots: Work schedule of the company, provided as a list of tuples where each tuple + represents one time slot of business hours. Each tuple consists of a start + and an end time given in seconds since week start, e.g., + [(25200, 61200), (111600, 129600), (133200, 147600)] meaning that business hours + are Mondays 07:00 - 17:00 and Tuesdays 07:00 - 12:00 and 13:00 - 17:00. + :type business_hour_slots: Optional[List[Tuple[int, int]]] + :return: OC-DFG discovery result. + :rtype: Dict[str, Any] ..
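Because the slot boundaries are plain seconds since the start of the week, they are easiest to derive from hour and day offsets. A minimal sketch (the file name 'trial.ocel' is a placeholder, as in the examples below):

.. code-block:: python3

    import pm4py

    HOUR = 60 * 60
    DAY = 24 * HOUR

    # seconds since week start: Monday starts at 0, Tuesday at DAY
    business_hour_slots = [
        (7 * HOUR, 17 * HOUR),               # Monday 07:00 - 17:00 -> (25200, 61200)
        (DAY + 7 * HOUR, DAY + 12 * HOUR),   # Tuesday 07:00 - 12:00 -> (111600, 129600)
        (DAY + 13 * HOUR, DAY + 17 * HOUR),  # Tuesday 13:00 - 17:00 -> (133200, 147600)
    ]

    ocel = pm4py.read_ocel("trial.ocel")
    ocdfg = pm4py.discover_ocdfg(
        ocel,
        business_hours=True,
        business_hour_slots=business_hour_slots,
    )

..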
code-block:: python3 @@ -236,9 +287,10 @@ def discover_ocdfg(ocel: OCEL, business_hours=False, business_hour_slots=constan ocdfg = pm4py.discover_ocdfg(ocel) """ - parameters = {} - parameters["business_hours"] = business_hours - parameters["business_hour_slots"] = business_hour_slots + parameters = { + "business_hours": business_hours, + "business_hour_slots": business_hour_slots, + } from pm4py.algo.discovery.ocel.ocdfg import algorithm as ocdfg_discovery return ocdfg_discovery.apply(ocel, parameters=parameters) @@ -247,13 +299,17 @@ def discover_oc_petri_net(ocel: OCEL, inductive_miner_variant: str = "im", diagn """ Discovers an object-centric Petri net from the provided object-centric event log. - Reference paper: van der Aalst, Wil MP, and Alessandro Berti. "Discovering object-centric Petri nets." Fundamenta informaticae 175.1-4 (2020): 1-40. + Reference paper: van der Aalst, Wil MP, and Alessandro Berti. + "Discovering object-centric Petri nets." Fundamenta Informaticae 175.1-4 (2020): 1-40. - :param ocel: object-centric event log - :param inductive_miner_variant: specify the variant of the inductive miner to be used - ("im" for traditional; "imd" for the faster inductive miner directly-follows) - :param diagnostics_with_tbr: (boolean) enables the computation of some diagnostics using token-based replay - :rtype: ``Dict[str, Any]`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :param inductive_miner_variant: Variant of the inductive miner to use ("im" for traditional; "imd" for the faster inductive miner directly-follows). + :type inductive_miner_variant: str + :param diagnostics_with_tbr: Enable the computation of diagnostics using token-based replay if set to True. + :type diagnostics_with_tbr: bool + :return: Discovered object-centric Petri net. + :rtype: Dict[str, Any] .. code-block:: python3 @@ -262,27 +318,40 @@ def discover_oc_petri_net(ocel: OCEL, inductive_miner_variant: str = "im", diagn ocpn = pm4py.discover_oc_petri_net(ocel) """ from pm4py.algo.discovery.ocel.ocpn import algorithm as ocpn_discovery - parameters = {} - parameters["inductive_miner_variant"] = inductive_miner_variant - parameters["diagnostics_with_token_based_replay"] = diagnostics_with_tbr + parameters = { + "inductive_miner_variant": inductive_miner_variant, + "diagnostics_with_token_based_replay": diagnostics_with_tbr, + } return ocpn_discovery.apply(ocel, parameters=parameters) def discover_objects_graph(ocel: OCEL, graph_type: str = "object_interaction") -> Set[Tuple[str, str]]: """ - Discovers an object graph from the provided object-centric event log + Discovers an object graph from the provided object-centric event log. - :param ocel: object-centric event log - :param graph_type: type of graph to consider (object_interaction, object_descendants, object_inheritance, object_cobirth, object_codeath) - :rtype: ``Dict[str, Any]`` + Available graph types: + - "object_interaction" + - "object_descendants" + - "object_inheritance" + - "object_cobirth" + - "object_codeath" + + :param ocel: Object-centric event log. + :type ocel: OCEL + :param graph_type: Type of graph to consider. + Options include "object_interaction", "object_descendants", + "object_inheritance", "object_cobirth", "object_codeath". + :type graph_type: str + :return: Discovered object graph as a set of tuples. + :rtype: Set[Tuple[str, str]] .. 
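All five graph types listed above can be computed in one pass over the same log; a minimal sketch with a placeholder file name:

.. code-block:: python3

    import pm4py

    ocel = pm4py.read_ocel("trial.ocel")

    graph_types = [
        "object_interaction",
        "object_descendants",
        "object_inheritance",
        "object_cobirth",
        "object_codeath",
    ]

    # each call returns a set of (object identifier, object identifier) tuples
    graphs = {gt: pm4py.discover_objects_graph(ocel, graph_type=gt) for gt in graph_types}

..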
code-block:: python3 import pm4py ocel = pm4py.read_ocel('trial.ocel') - obj_graph = pm4py.ocel_discover_objects_graph(ocel, graph_type='object_interaction') + obj_graph = pm4py.discover_objects_graph(ocel, graph_type='object_interaction') """ if graph_type == "object_interaction": from pm4py.algo.transformation.ocel.graphs import object_interaction_graph @@ -299,17 +368,22 @@ def discover_objects_graph(ocel: OCEL, graph_type: str = "object_interaction") - elif graph_type == "object_codeath": from pm4py.algo.transformation.ocel.graphs import object_codeath_graph return object_codeath_graph.apply(ocel) + else: + raise ValueError(f"Unsupported graph_type: {graph_type}") def ocel_o2o_enrichment(ocel: OCEL, included_graphs: Optional[Collection[str]] = None) -> OCEL: """ - Inserts the information inferred from the graph computations (pm4py.discover_objects_graph) - in the list of O2O relations of the OCEL. - - :param ocel: object-centric event log - :param included_graphs: types of graphs to include, provided as list/set of strings (object_interaction_graph, object_descendants_graph, object_inheritance_graph, object_cobirth_graph, object_codeath_graph) - :rtype: ``OCEL`` + Enriches the OCEL with information inferred from graph computations by inserting them into the O2O relations. + :param ocel: Object-centric event log. + :type ocel: OCEL + :param included_graphs: Types of graphs to include, provided as a list or set of strings. + Options include "object_interaction_graph", "object_descendants_graph", + "object_inheritance_graph", "object_cobirth_graph", "object_codeath_graph". + :type included_graphs: Optional[Collection[str]] + :return: Enriched object-centric event log. + :rtype: OCEL .. code-block:: python3 @@ -325,11 +399,13 @@ def ocel_o2o_enrichment(ocel: OCEL, included_graphs: Optional[Collection[str]] = def ocel_e2o_lifecycle_enrichment(ocel: OCEL) -> OCEL: """ - Inserts lifecycle-based information (when an object is created/terminated or other types of relations) - in the list of E2O relations of the OCEL + Enriches the OCEL with lifecycle-based information, indicating when an object is created, terminated, + or has other types of relations, by updating the E2O relations. - :param ocel: object-centric event log - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: Enriched object-centric event log with lifecycle information. + :rtype: OCEL .. code-block:: python3 @@ -348,60 +424,74 @@ def ocel_e2o_lifecycle_enrichment(ocel: OCEL) -> OCEL: def sample_ocel_objects(ocel: OCEL, num_objects: int) -> OCEL: """ - Given an object-centric event log, returns a sampled event log with a subset of the objects - that is chosen in a random way. - Only the events related to at least one of these objects are filtered from the event log. - As a note, the relationships between the different objects are probably going to be ruined by - this sampling. + Returns a sampled object-centric event log containing a random subset of objects. + Only events related to at least one of the sampled objects are included in the returned log. + Note that this sampling may disrupt the relationships between different objects. - :param ocel: Object-centric event log - :param num_objects: Number of objects of the object-centric event log - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :param num_objects: Number of objects to include in the sampled event log. + :type num_objects: int + :return: Sampled object-centric event log. + :rtype: OCEL .. 
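A minimal sketch of the enrichment with a restricted set of graph types (the ``ocel.o2o`` attribute used to inspect the result is an assumption of this sketch; the function itself only promises to extend the O2O relations):

.. code-block:: python3

    import pm4py

    ocel = pm4py.read_ocel("trial.ocel")

    # restrict the enrichment to two of the supported graph types
    ocel = pm4py.ocel_o2o_enrichment(
        ocel,
        included_graphs=["object_interaction_graph", "object_descendants_graph"],
    )

    # the inferred relations are added to the O2O relations of the OCEL;
    # inspecting them through ocel.o2o is an assumption of this sketch
    print(ocel.o2o)

..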
code-block:: python3 import pm4py ocel = pm4py.read_ocel('trial.ocel') - sampled_ocel = pm4py.sample_ocel_objects(ocel, 50) # keeps only 50 random objects + sampled_ocel = pm4py.sample_ocel_objects(ocel, 50) # Keeps only 50 random objects """ from pm4py.objects.ocel.util import sampling return sampling.sample_ocel_objects(ocel, parameters={"num_entities": num_objects}) -def sample_ocel_connected_components(ocel: OCEL, connected_components: int = 1, - max_num_events_per_cc: int = sys.maxsize, - max_num_objects_per_cc: int = sys.maxsize, - max_num_e2o_relations_per_cc: int = sys.maxsize) -> OCEL: +def sample_ocel_connected_components( + ocel: OCEL, + connected_components: int = 1, + max_num_events_per_cc: int = sys.maxsize, + max_num_objects_per_cc: int = sys.maxsize, + max_num_e2o_relations_per_cc: int = sys.maxsize +) -> OCEL: """ - Given an object-centric event log, returns a sampled event log with a subset of the executions. - The number of considered connected components need to be specified by the user. + Returns a sampled object-centric event log containing a specified number of connected components. + Users can also set maximum limits on the number of events, objects, and E2O relations per connected component. - Paper: - Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022. - - :param ocel: Object-centric event log - :param connected_components: Number of connected components to pick from the OCEL - :param max_num_events_per_cc: maximum number of events allowed per connected component (default: sys.maxsize) - :param max_num_objects_per_cc: maximum number of events allowed per connected component (default: sys.maxsize) - :param max_num_e2o_relations_per_cc: maximum number of event-to-object relationships allowed per connected component (default: sys.maxsize) - :rtype: ``OCEL`` + Reference paper: + Adams, Jan Niklas, et al. "Defining cases and variants for object-centric event data." + 2022 4th International Conference on Process Mining (ICPM). IEEE, 2022. + + :param ocel: Object-centric event log. + :type ocel: OCEL + :param connected_components: Number of connected components to include in the sampled event log. + :type connected_components: int + :param max_num_events_per_cc: Maximum number of events allowed per connected component (default: sys.maxsize). + :type max_num_events_per_cc: int + :param max_num_objects_per_cc: Maximum number of objects allowed per connected component (default: sys.maxsize). + :type max_num_objects_per_cc: int + :param max_num_e2o_relations_per_cc: Maximum number of event-to-object relationships allowed per connected component (default: sys.maxsize). + :type max_num_e2o_relations_per_cc: int + :return: Sampled object-centric event log containing the specified connected components. + :rtype: OCEL .. 
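A minimal sketch combining the size limits documented above (placeholder file name):

.. code-block:: python3

    import pm4py

    ocel = pm4py.read_ocel("trial.ocel")

    # keep at most 5 connected components, each with at most 1000 events
    # and at most 500 objects; the E2O relation limit is left unbounded
    sampled_ocel = pm4py.sample_ocel_connected_components(
        ocel,
        connected_components=5,
        max_num_events_per_cc=1000,
        max_num_objects_per_cc=500,
    )

..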
code-block:: python3 import pm4py ocel = pm4py.read_ocel('trial.ocel') - sampled_ocel = pm4py.sample_ocel_connected_components(ocel, 5) # keeps only 5 connected components + sampled_ocel = pm4py.sample_ocel_connected_components(ocel, 5) # Keeps only 5 connected components """ from pm4py.algo.transformation.ocel.split_ocel import algorithm ocel_splits = algorithm.apply(ocel, variant=algorithm.Variants.CONNECTED_COMPONENTS) events = None objects = None relations = None - ocel_splits = [x for x in ocel_splits if - len(x.events) <= max_num_events_per_cc and len(x.objects) <= max_num_objects_per_cc and len( - x.relations) <= max_num_e2o_relations_per_cc] + ocel_splits = [ + x for x in ocel_splits + if len(x.events) <= max_num_events_per_cc + and len(x.objects) <= max_num_objects_per_cc + and len(x.relations) <= max_num_e2o_relations_per_cc + ] if len(ocel_splits) > 0: ocel_splits = random.sample(ocel_splits, min(connected_components, len(ocel_splits))) @@ -421,12 +511,14 @@ def sample_ocel_connected_components(ocel: OCEL, connected_components: int = 1, def ocel_drop_duplicates(ocel: OCEL) -> OCEL: """ - Drop relations between events and objects happening at the same time, - with the same activity, to the same object identifier. - This ends up cleaning the OCEL from duplicate events. + Removes duplicate relations between events and objects that occur at the same time, + have the same activity, and are linked to the same object identifier. + This effectively cleans the OCEL by eliminating duplicate events. - :param ocel: object-centric event log - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :return: Cleaned object-centric event log without duplicate relations. + :rtype: OCEL .. code-block:: python3 @@ -434,23 +526,26 @@ def ocel_drop_duplicates(ocel: OCEL) -> OCEL: ocel = pm4py.read_ocel('trial.ocel') ocel = pm4py.ocel_drop_duplicates(ocel) - """ from pm4py.objects.ocel.util import filtering_utils ocel.relations = ocel.relations.drop_duplicates( - subset=[ocel.event_activity, ocel.event_timestamp, ocel.object_id_column]) + subset=[ocel.event_activity, ocel.event_timestamp, ocel.object_id_column] + ) ocel = filtering_utils.propagate_relations_filtering(ocel) return ocel -def ocel_merge_duplicates(ocel: OCEL, have_common_object: Optional[bool]=False) -> OCEL: +def ocel_merge_duplicates(ocel: OCEL, have_common_object: Optional[bool] = False) -> OCEL: """ - Merge events in the OCEL that happen with the same activity at the same timestamp + Merges events in the OCEL that have the same activity and timestamp. Optionally, ensures that + the events being merged share a common object. - :param ocel: object-centric event log - :param have_common_object: impose the additional merge condition that the two events should happen at the same - timestamp. - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :param have_common_object: If set to True, only merges events that share a common object. Defaults to False. + :type have_common_object: Optional[bool] + :return: Object-centric event log with merged duplicate events. + :rtype: OCEL .. 
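A minimal cleaning sketch that chains the two functions above (placeholder file name):

.. code-block:: python3

    import pm4py

    ocel = pm4py.read_ocel("trial.ocel")

    # first remove duplicate event-object relations...
    ocel = pm4py.ocel_drop_duplicates(ocel)

    # ...then merge events with the same activity and timestamp,
    # requiring that the merged events share at least one object
    ocel = pm4py.ocel_merge_duplicates(ocel, have_common_object=True)

..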
code-block:: python3 @@ -463,14 +558,20 @@ def ocel_merge_duplicates(ocel: OCEL, have_common_object: Optional[bool]=False) import uuid relations = copy.copy(ocel.relations) if have_common_object: - relations["@@groupn"] = relations.groupby([ocel.object_id_column, ocel.event_activity, ocel.event_timestamp]).ngroup() + relations["@@groupn"] = relations.groupby( + [ocel.object_id_column, ocel.event_activity, ocel.event_timestamp] + ).ngroup() else: - relations["@@groupn"] = relations.groupby([ocel.event_activity, ocel.event_timestamp]).ngroup() + relations["@@groupn"] = relations.groupby( + [ocel.event_activity, ocel.event_timestamp] + ).ngroup() group_size = relations["@@groupn"].value_counts().to_dict() relations["@@groupsize"] = relations["@@groupn"].map(group_size) relations = relations.sort_values(["@@groupsize", "@@groupn"], ascending=False) - val_corr = {x: str(uuid.uuid4()) for x in pandas_utils.format_unique(relations["@@groupn"].unique())} + val_corr = { + x: str(uuid.uuid4()) for x in pandas_utils.format_unique(relations["@@groupn"].unique()) + } relations = relations.groupby(ocel.event_id_column).first()["@@groupn"].to_dict() relations = {x: val_corr[y] for x, y in relations.items()} @@ -483,17 +584,22 @@ def ocel_merge_duplicates(ocel: OCEL, have_common_object: Optional[bool]=False) return ocel - -def ocel_sort_by_additional_column(ocel: OCEL, additional_column: str, primary_column: str = "ocel:timestamp") -> OCEL: +def ocel_sort_by_additional_column( + ocel: OCEL, additional_column: str, primary_column: str = "ocel:timestamp" +) -> OCEL: """ - Sorts the OCEL not only based on the timestamp column and the index, - but using an additional sorting column that further determines the order of - the events happening at the same timestamp. + Sorts the OCEL based on the primary timestamp column and an additional column to determine + the order of events occurring at the same timestamp. - :param ocel: object-centric event log - :param additional_column: additional column to use for the sorting - :param primary_column: primary column to be used for the sorting (default: ocel:timestamp) - :rtype: ``OCEL`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :param additional_column: Additional column to use for sorting. + :type additional_column: str + :param primary_column: Primary column to use for sorting (default: "ocel:timestamp"). + Typically the timestamp column. + :type primary_column: str + :return: Sorted object-centric event log. + :rtype: OCEL .. code-block:: python3 @@ -501,7 +607,6 @@ def ocel_sort_by_additional_column(ocel: OCEL, additional_column: str, primary_c ocel = pm4py.read_ocel('trial.ocel') ocel = pm4py.ocel_sort_by_additional_column(ocel, 'ordering') - """ ocel.events = pandas_utils.insert_index(ocel.events, "@@index", reset_index=False, copy_dataframe=False) ocel.events = ocel.events.sort_values([primary_column, additional_column, "@@index"]) @@ -512,12 +617,13 @@ def ocel_sort_by_additional_column(ocel: OCEL, additional_column: str, primary_c def ocel_add_index_based_timedelta(ocel: OCEL) -> OCEL: """ - Adds a small time-delta to the timestamp column based on the current index of the event. - This ensures the correct ordering of the events in any object-centric process mining - solution. + Adds a small time delta to the timestamp column based on the event index to ensure the correct ordering + of events within any object-centric process mining solution. - :param ocel: object-centric event log - :rtype: ``OCEL`` + :param ocel: Object-centric event log. 
+ :type ocel: OCEL + :return: Object-centric event log with index-based time deltas added. + :rtype: OCEL .. code-block:: python3 @@ -525,7 +631,6 @@ def ocel_add_index_based_timedelta(ocel: OCEL) -> OCEL: ocel = pm4py.read_ocel('trial.ocel') ocel = pm4py.ocel_add_index_based_timedelta(ocel) - """ from datetime import timedelta eids = ocel.events[ocel.event_id_column].to_numpy().tolist() @@ -539,15 +644,22 @@ def ocel_add_index_based_timedelta(ocel: OCEL) -> OCEL: return ocel -def cluster_equivalent_ocel(ocel: OCEL, object_type: str, max_objs: int = sys.maxsize) -> Dict[str, Collection[OCEL]]: +def cluster_equivalent_ocel( + ocel: OCEL, object_type: str, max_objs: int = sys.maxsize +) -> Dict[str, Collection[OCEL]]: """ - Perform a clustering of the object-centric event log, based on the 'executions' of - a single object type. Equivalent 'executions' are grouped in the output dictionary. + Clusters the object-centric event log based on the 'executions' of a single object type. + Equivalent 'executions' are grouped together in the output dictionary. - :param ocel: object-centric event log - :param object_type: reference object type - :param max_objs: maximum number of objects (of the given object type) - :rtype: ``Dict[str, Collection[OCEL]]`` + :param ocel: Object-centric event log. + :type ocel: OCEL + :param object_type: Reference object type for clustering. + :type object_type: str + :param max_objs: Maximum number of objects (of the specified object type) to include per cluster. + Defaults to sys.maxsize. + :type max_objs: int + :return: Dictionary mapping cluster descriptions to collections of equivalent OCELs. + :rtype: Dict[str, Collection[OCEL]] .. code-block:: python3 @@ -559,7 +671,12 @@ def cluster_equivalent_ocel(ocel: OCEL, object_type: str, max_objs: int = sys.ma from pm4py.algo.transformation.ocel.split_ocel import algorithm as split_ocel_algorithm from pm4py.objects.ocel.util import rename_objs_ot_tim_lex from pm4py.algo.transformation.ocel.description import algorithm as ocel_description - lst_ocels = split_ocel_algorithm.apply(ocel, variant=split_ocel_algorithm.Variants.ANCESTORS_DESCENDANTS, parameters={"object_type": object_type, "max_objs": max_objs}) + + lst_ocels = split_ocel_algorithm.apply( + ocel, + variant=split_ocel_algorithm.Variants.ANCESTORS_DESCENDANTS, + parameters={"object_type": object_type, "max_objs": max_objs}, + ) ret = {} for index, oc in enumerate(lst_ocels): oc_ren = rename_objs_ot_tim_lex.apply(oc) diff --git a/pm4py/org.py b/pm4py/org.py index 31f7465da..b41c43958 100644 --- a/pm4py/org.py +++ b/pm4py/org.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.org`` module contains the organizational analysis techniques offered in ``pm4py`` +The ``pm4py.org`` module contains organizational analysis techniques offered in ``pm4py``. """ from typing import Union @@ -39,21 +39,28 @@ def discover_handover_of_work_network(log: Union[EventLog, pd.DataFrame], beta=0, resource_key: str = "org:resource", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> SNA: """ Calculates the handover of work network of the event log. - The handover of work network is essentially the DFG of the event log, however, using the - resource as a node of the graph, instead of the activity. - As such, to use this, resource information should be present in the event log. 
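As a follow-up to ``cluster_equivalent_ocel`` above, the returned dictionary can be iterated directly; a minimal sketch, assuming the placeholder log contains an object type named "order":

.. code-block:: python3

    import pm4py

    ocel = pm4py.read_ocel("trial.ocel")

    # "order" is an assumed object type of the placeholder log
    clusters = pm4py.cluster_equivalent_ocel(ocel, "order")

    for description, ocel_group in clusters.items():
        # each value is a collection of OCELs sharing an equivalent execution
        print(description, len(ocel_group))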
- :param log: event log / Pandas dataframe - :param beta: beta parameter for Handover metric - :param resource_key: attribute to be used for the resource - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + The handover of work network is essentially the Directly-Follows Graph (DFG) of the event log, but using the + resource as the nodes of the graph instead of activities. + As such, resource information should be present in the event log. + + :param log: Event log or Pandas DataFrame. + :param beta: Beta parameter for the Handover metric. + :param resource_key: Attribute to be used for the resource. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. .. code-block:: python3 import pm4py - metric = pm4py.discover_handover_of_work_network(dataframe, resource_key='org:resource', timestamp_key='time:timestamp', case_id_key='case:concept:name') + metric = pm4py.discover_handover_of_work_network( + dataframe, + beta=0, + resource_key='org:resource', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -70,18 +77,23 @@ def discover_handover_of_work_network(log: Union[EventLog, pd.DataFrame], beta=0 def discover_working_together_network(log: Union[EventLog, pd.DataFrame], resource_key: str = "org:resource", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> SNA: """ Calculates the working together network of the process. - Two nodes resources are connected in the graph if the resources collaborate on an instance of the process. + Two resource nodes are connected in the graph if the resources collaborate on an instance of the process. - :param log: event log / Pandas dataframe - :param resource_key: attribute to be used for the resource - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param resource_key: Attribute to be used for the resource. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. .. code-block:: python3 import pm4py - metric = pm4py.discover_working_together_network(dataframe, resource_key='org:resource', timestamp_key='time:timestamp', case_id_key='case:concept:name') + metric = pm4py.discover_working_together_network( + dataframe, + resource_key='org:resource', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -97,19 +109,25 @@ def discover_working_together_network(log: Union[EventLog, pd.DataFrame], resour def discover_activity_based_resource_similarity(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", resource_key: str = "org:resource", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> SNA: """ - Calculates similarity between the resources in the event log, based on their activity profiles. + Calculates similarity between the resources in the event log based on their activity profiles. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param resource_key: attribute to be used for the resource - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. 
+ :param activity_key: Attribute to be used for the activity. + :param resource_key: Attribute to be used for the resource. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. .. code-block:: python3 import pm4py - act_res_sim = pm4py.discover_activity_based_resource_similarity(dataframe, resource_key='org:resource', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + act_res_sim = pm4py.discover_activity_based_resource_similarity( + dataframe, + resource_key='org:resource', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -127,17 +145,23 @@ def discover_subcontracting_network(log: Union[EventLog, pd.DataFrame], n=2, res """ Calculates the subcontracting network of the process. - :param log: event log / Pandas dataframe - :param n: n parameter for Subcontracting metric - :param resource_key: attribute to be used for the resource - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param n: N parameter for the Subcontracting metric. + :param resource_key: Attribute to be used for the resource. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. .. code-block:: python3 import pm4py - metric = pm4py.discover_subcontracting_network(dataframe, resource_key='org:resource', timestamp_key='time:timestamp', case_id_key='case:concept:name') + metric = pm4py.discover_subcontracting_network( + dataframe, + n=2, + resource_key='org:resource', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -153,24 +177,30 @@ def discover_subcontracting_network(log: Union[EventLog, pd.DataFrame], n=2, res def discover_organizational_roles(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", resource_key: str = "org:resource", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> List[Role]: """ - Mines the organizational roles + Mines the organizational roles. - A role is a set of activities in the log that are executed by a similar (multi)set of resources. Hence, it is a specific function into organization. Grouping the activities in roles can help: + A role is a set of activities in the log that are executed by a similar (multi)set of resources. Hence, it is a specific function within the organization. Grouping the activities into roles can help: Reference paper: Burattin, Andrea, Alessandro Sperduti, and Marco Veluscek. “Business models enhancement through discovery of roles.” 2013 IEEE Symposium on Computational Intelligence and Data Mining (CIDM). IEEE, 2013. - :param log: event log / Pandas dataframe - :param activity_key: attribute to be used for the activity - :param resource_key: attribute to be used for the resource - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier + :param log: Event log or Pandas DataFrame. + :param activity_key: Attribute to be used for the activity. + :param resource_key: Attribute to be used for the resource. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as case identifier. .. 
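The organizational metrics in this module accept the same keyword arguments, so they are often computed together; a minimal sketch on a placeholder dataframe:

.. code-block:: python3

    import pm4py

    # placeholder dataframe with org:resource, concept:name and time:timestamp columns
    dataframe = pm4py.read_xes("tests/input_data/receipt.xes")

    handover = pm4py.discover_handover_of_work_network(dataframe, beta=0)
    working_together = pm4py.discover_working_together_network(dataframe)
    subcontracting = pm4py.discover_subcontracting_network(dataframe, n=2)
    similarity = pm4py.discover_activity_based_resource_similarity(dataframe)
    roles = pm4py.discover_organizational_roles(dataframe)

..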
code-block:: python3 import pm4py - roles = pm4py.discover_organizational_roles(dataframe, resource_key='org:resource', activity_key='concept:name', timestamp_key='time:timestamp', case_id_key='case:concept:name') + roles = pm4py.discover_organizational_roles( + dataframe, + resource_key='org:resource', + activity_key='concept:name', + timestamp_key='time:timestamp', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) @@ -188,43 +218,54 @@ def discover_network_analysis(log: Union[pd.DataFrame, EventLog, EventStream], o """ Performs a network analysis of the log based on the provided parameters. - The classical social network analysis methods are based on the order of the events inside a case. For example, the Handover of Work metric considers the directly-follows relationships between resources during the work of a case. An edge is added between the two resources if such relationships occurs. + Classical social network analysis methods are based on the order of events within a case. For example, the Handover of Work metric considers the directly-follows relationships between resources during the execution of a case. An edge is added between two resources if such a relationship occurs. - Real-life scenarios may be more complicated. At first, is difficult to collect events inside the same case without having convergence/divergence issues (see first section of the OCEL part). At second, the type of relationship may also be important. Consider for example the relationship between two resources: this may be more efficient if the activity that is executed is liked by the resources, rather than disgusted. + Real-life scenarios may be more complicated. Firstly, it is difficult to collect events within the same case without encountering convergence/divergence issues (see the first section of the OCEL part). Secondly, the type of relationship may also be important. For example, the relationship between two resources may be more efficient if the activity executed is liked by the resources rather than disliked. - The network analysis that we introduce here generalizes some existing social network analysis metrics, becoming independent from the choice of a case notion and permitting to build a multi-graph instead of a simple graph. + The network analysis introduced here generalizes some existing social network analysis metrics, making them independent of the case notion and allowing the construction of a multigraph instead of a simple graph. - With this, we assume events to be linked by signals. An event emits a signal (that is contained as one attribute of the event) that is assumed to be received by other events (also, this is an attribute of these events) that follow the first event in the log. So, we assume there is an OUT attribute (of the event) that is identical to the IN attribute (of the other events). + We assume events are linked by signals. An event emits a signal (contained in one attribute of the event) that is assumed to be received by other events (also containing this attribute) that follow the first event in the log. We assume there is an OUT attribute (of the event) that is identical to the IN attribute (of the other events). - When we collect this information, we can build the network analysis graph: - - The source node of the relation is given by an aggregation over a node_column_source attribute. - - The target node of the relation is given by an aggregation over a node_column_target attribute. 
- - The type of edge is given by an aggregation over an edge_column attribute. - - The network analysis graph can either be annotated with frequency or performance information. + When collecting this information, we can build the network analysis graph: + - The source node of the relationship is determined by aggregating the `node_column_source` attribute. + - The target node of the relationship is determined by aggregating the `node_column_target` attribute. + - The type of edge is determined by aggregating the `edge_column` attribute. + - The network analysis graph can be annotated with frequency or performance information. The output is a multigraph. - Two events EV1 and EV2 of the log are merged (indipendently from the case notion) based on having + Two events EV1 and EV2 in the log are connected (independently of the case notion) based on having EV1.OUT_COLUMN = EV2.IN_COLUMN. - Then, an aggregation is applied on the couple of events (NODE_COLUMN) to obtain the nodes that are connected. - The edges between these nodes are aggregated based on some property of the *source* event (EDGE_COLUMN). - - :param log: event log / Pandas dataframe - :param out_column: the source column of the link (default: the case identifier; events of the same case are linked) - :param in_column: the target column of the link (default: the case identifier; events of the same case are linked) - :param node_column_source: the attribute to be used for the node definition of the source event (default: the resource of the log, org:resource) - :param node_column_target: the attribute to be used for the node definition of the target event (default: the resource of the log, org:resource) - :param edge_column: the attribute to be used for the edge definition (default: the activity of the log, concept:name) - :param edge_reference: decide if the edge attribute should be picked from the source event. Values: _out => the source event ; _in => the target event - :param performance: boolean value that enables the performance calculation on the edges of the network analysis - :param sorting_column: the column that should be used to sort the log before performing the network analysis (default: time:timestamp) - :param timestamp_column: the column that should be used as timestamp for the performance-related analysis (default: time:timestamp) + Then, an aggregation is applied on the pair of events (NODE_COLUMN) to obtain the connected nodes. + The edges between these nodes are aggregated based on some property of the *source* event (`edge_column`). + + :param log: Event log, Pandas DataFrame, or EventStream. + :param out_column: The source column of the link (default: the case identifier; events of the same case are linked). + :param in_column: The target column of the link (default: the case identifier; events of the same case are linked). + :param node_column_source: The attribute to be used for defining the source node (default: the resource of the log, "org:resource"). + :param node_column_target: The attribute to be used for defining the target node (default: the resource of the log, "org:resource"). + :param edge_column: The attribute to be used for defining the edge (default: the activity of the log, "concept:name"). + :param edge_reference: Determines if the edge attribute should be picked from the source event. Values: "_out" => the source event; "_in" => the target event. + :param performance: Boolean value that enables performance calculation on the edges of the network analysis. 
+ :param sorting_column: The column to be used for sorting the log before performing the network analysis (default: "time:timestamp"). + :param timestamp_column: The column to be used as timestamp for performance-related analysis (default: "time:timestamp"). :rtype: ``Dict[Tuple[str, str], Dict[str, Any]]`` .. code-block:: python3 import pm4py - net_ana = pm4py.discover_network_analysis(dataframe, out_column='case:concept:name', in_column='case:concept:name', node_column_source='org:resource', node_column_target='org:resource', edge_column='concept:name') + net_ana = pm4py.discover_network_analysis( + dataframe, + out_column='case:concept:name', + in_column='case:concept:name', + node_column_source='org:resource', + node_column_target='org:resource', + edge_column='concept:name', + edge_reference='_out', + performance=False, + sorting_column='time:timestamp', + timestamp_column='time:timestamp' + ) """ __event_log_deprecation_warning(log) diff --git a/pm4py/read.py b/pm4py/read.py index dfbf954da..6fefe7a64 100644 --- a/pm4py/read.py +++ b/pm4py/read.py @@ -36,20 +36,32 @@ INDEX_COLUMN = "@@index" __doc__ = """ -The ``pm4py.read`` module contains all funcationality related to reading files/objects from disk. +The `pm4py.read` module contains all functionality related to reading files and objects from disk. """ -def read_xes(file_path: str, variant: Optional[str] = None, return_legacy_log_object: bool = constants.DEFAULT_READ_XES_LEGACY_OBJECT, encoding: str = constants.DEFAULT_ENCODING, **kwargs) -> Union[DataFrame, EventLog]: +def read_xes( + file_path: str, + variant: Optional[str] = None, + return_legacy_log_object: bool = constants.DEFAULT_READ_XES_LEGACY_OBJECT, + encoding: str = constants.DEFAULT_ENCODING, + **kwargs +) -> Union[DataFrame, EventLog]: """ - Reads an event log stored in XES format (see `xes-standard `_) - Returns a table (``pandas.DataFrame``) view of the event log. - - :param file_path: file path of the event log (``.xes`` file) on disk - :param variant: the variant of the importer to use. "iterparse" => traditional XML parser; "line_by_line" => text-based line-by-line importer ; "chunk_regex" => chunk-of-bytes importer (default); "iterparse20" => XES 2.0 importer - :param return_legacy_log_object: boolean value enabling returning a log object (default: False) - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``DataFrame`` + Reads an event log stored in XES format (see `xes-standard `_). + Returns a table (`pandas.DataFrame`) view of the event log or an `EventLog` object. + + :param file_path: Path to the event log (`.xes` file) on disk. + :param variant: Variant of the importer to use. Options include: + - "iterparse" – traditional XML parser, + - "line_by_line" – text-based line-by-line importer, + - "chunk_regex" – chunk-of-bytes importer (default), + - "iterparse20" – XES 2.0 importer, + - "rustxes" – Rust-based importer. + :param return_legacy_log_object: Boolean indicating whether to return a legacy `EventLog` object (default: `False`). + :param encoding: Encoding to be used (default: `utf-8`). + :param **kwargs: Additional parameters to pass to the importer. + :rtype: `pandas.DataFrame` or `pm4py.objects.log.obj.EventLog` .. 
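A minimal sketch of the importer variants listed above (placeholder file path):

.. code-block:: python3

    import pm4py

    # default importer (chunk_regex) returning a pandas.DataFrame
    dataframe = pm4py.read_xes("tests/input_data/running-example.xes")

    # explicit variant, returning a legacy EventLog object instead
    log = pm4py.read_xes(
        "tests/input_data/running-example.xes",
        variant="iterparse",
        return_legacy_log_object=True,
    )

..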
code-block:: python3 @@ -95,18 +107,23 @@ def read_xes(file_path: str, variant: Optional[str] = None, return_legacy_log_ob return log -def read_pnml(file_path: str, auto_guess_final_marking: bool = False, encoding: str = constants.DEFAULT_ENCODING) -> Tuple[PetriNet, Marking, Marking]: +def read_pnml( + file_path: str, + auto_guess_final_marking: bool = False, + encoding: str = constants.DEFAULT_ENCODING +) -> Tuple[PetriNet, Marking, Marking]: """ - Reads a Petri net object from a .pnml file. - The Petri net object returned is a triple containing the following objects: + Reads a Petri net object from a `.pnml` file. + The returned Petri net object is a tuple containing: - 1. Petrinet Object, encoded as a ``PetriNet`` class - #. Initial Marking - #. Final Marking + 1. PetriNet object (`PetriNet`) + 2. Initial Marking (`Marking`) + 3. Final Marking (`Marking`) - :rtype: ``Tuple[PetriNet, Marking, Marking]`` - :param file_path: file path of the Petri net model (``.pnml`` file) on disk - :param encoding: the encoding to be used (default: utf-8) + :param file_path: Path to the Petri net model (`.pnml` file) on disk. + :param auto_guess_final_marking: Boolean indicating whether to automatically guess the final marking (default: `False`). + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `Tuple[PetriNet, Marking, Marking]` .. code-block:: python3 @@ -117,17 +134,23 @@ def read_pnml(file_path: str, auto_guess_final_marking: bool = False, encoding: if not os.path.exists(file_path): raise Exception("File does not exist") from pm4py.objects.petri_net.importer import importer as pnml_importer - net, im, fm = pnml_importer.apply(file_path, parameters={"auto_guess_final_marking": auto_guess_final_marking, "encoding": encoding}) + net, im, fm = pnml_importer.apply( + file_path, + parameters={ + "auto_guess_final_marking": auto_guess_final_marking, + "encoding": encoding + } + ) return net, im, fm def read_ptml(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> ProcessTree: """ - Reads a process tree object from a .ptml file + Reads a process tree object from a `.ptml` file. - :param file_path: file path of the process tree object on disk - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``ProcessTree`` + :param file_path: Path to the process tree file on disk. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `ProcessTree` .. code-block:: python3 @@ -142,46 +165,53 @@ def read_ptml(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> Pro return tree -def read_dfg(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> Tuple[Dict[Tuple[str,str],int], Dict[str,int], Dict[str,int]]: +def read_dfg( + file_path: str, + encoding: str = constants.DEFAULT_ENCODING +) -> Tuple[Dict[Tuple[str, str], int], Dict[str, int], Dict[str, int]]: """ - Reads a DFG object from a .dfg file. - The DFG object returned is a triple containing the following objects: + Reads a Directly-Follows Graph (DFG) from a `.dfg` file. + The returned DFG object is a tuple containing: - 1. DFG Object, encoded as a ``Dict[Tuple[str,str],int]``, s.t. ``DFG[('a','b')]=k`` implies that activity ``'a'`` is directly followed by activity ``'b'`` a total of ``k`` times in the log - #. Start activity dictionary, encoded as a ``Dict[str,int]``, s.t., ``S['a']=k`` implies that activity ``'a'`` is starting ``k`` traces in the event log - #. 
End activity dictionary, encoded as a ``Dict[str,int]``, s.t., ``E['z']=k`` implies that activity ``'z'`` is ending ``k`` traces in the event log. + 1. DFG (`Dict[Tuple[str, str], int]`): Maps pairs of activities to their occurrence count. + For example, `DFG[('a', 'b')] = k` indicates that activity `'a'` is directly followed by activity `'b'` a total of `k` times in the log. + 2. Start Activity Dictionary (`Dict[str, int]`): Maps activities to the number of traces they start. + For example, `S['a'] = k` implies that activity `'a'` starts `k` traces in the event log. + 3. End Activity Dictionary (`Dict[str, int]`): Maps activities to the number of traces they end. + For example, `E['z'] = k` implies that activity `'z'` ends `k` traces in the event log. - :rtype: ``Tuple[Dict[Tuple[str,str],int], Dict[str,int], Dict[str,int]]`` - :param file_path: file path of the dfg model on disk - :param encoding: the encoding to be used (default: utf-8) + :param file_path: Path to the DFG model file on disk. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `Tuple[Dict[Tuple[str, str], int], Dict[str, int], Dict[str, int]]` .. code-block:: python3 - import pm4py + import pm4py - dfg = pm4py.read_dfg("") + dfg = pm4py.read_dfg("") """ if not os.path.exists(file_path): raise Exception("File does not exist") from pm4py.objects.dfg.importer import importer as dfg_importer - dfg, start_activities, end_activities = dfg_importer.apply(file_path, parameters={"encoding": encoding}) + dfg, start_activities, end_activities = dfg_importer.apply( + file_path, parameters={"encoding": encoding} + ) return dfg, start_activities, end_activities def read_bpmn(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> BPMN: """ - Reads a BPMN model from a .bpmn file + Reads a BPMN model from a `.bpmn` file. - :param file_path: file path of the bpmn model - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``BPMN`` + :param file_path: Path to the BPMN model file on disk. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `BPMN` .. code-block:: python3 import pm4py bpmn = pm4py.read_bpmn('') - """ if not os.path.exists(file_path): raise Exception("File does not exist") @@ -190,15 +220,19 @@ def read_bpmn(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> BPM return bpmn_graph -def read_ocel(file_path: str, objects_path: Optional[str] = None, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: +def read_ocel( + file_path: str, + objects_path: Optional[str] = None, + encoding: str = constants.DEFAULT_ENCODING +) -> OCEL: """ Reads an object-centric event log from a file (see: http://www.ocel-standard.org/). - The ``OCEL`` object is returned by this method + Returns an `OCEL` object. - :param file_path: file path of the object-centric event log - :param objects_path: [Optional] file path from which the objects dataframe should be read - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the object-centric event log file. + :param objects_path: [Optional] Path to the objects dataframe file. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. 
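The three dictionaries returned by ``read_dfg`` can be inspected directly; a minimal sketch with a placeholder file path:

.. code-block:: python3

    import pm4py

    dfg, start_activities, end_activities = pm4py.read_dfg("model.dfg")

    # directly-follows counts between pairs of activities
    for (act_a, act_b), count in dfg.items():
        print(act_a, "->", act_b, count)

    # number of traces starting / ending with each activity
    print(start_activities)
    print(end_activities)

..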
code-block:: python3 @@ -216,18 +250,22 @@ def read_ocel(file_path: str, objects_path: Optional[str] = None, encoding: str return read_ocel_xml(file_path, encoding=encoding) elif file_path.lower().endswith(".sqlite"): return read_ocel_sqlite(file_path, encoding=encoding) - raise Exception("unsupported file format") + raise Exception("Unsupported file format") -def read_ocel_csv(file_path: str, objects_path: Optional[str] = None, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: +def read_ocel_csv( + file_path: str, + objects_path: Optional[str] = None, + encoding: str = constants.DEFAULT_ENCODING +) -> OCEL: """ Reads an object-centric event log from a CSV file (see: http://www.ocel-standard.org/). - The ``OCEL`` object is returned by this method + Returns an `OCEL` object. - :param file_path: file path of the object-centric event log (.csv) - :param objects_path: [Optional] file path from which the objects dataframe should be read - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the object-centric event log file (`.csv`). + :param objects_path: [Optional] Path to the objects dataframe file. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. code-block:: python3 @@ -239,17 +277,21 @@ def read_ocel_csv(file_path: str, objects_path: Optional[str] = None, encoding: raise Exception("File does not exist") from pm4py.objects.ocel.importer.csv import importer as csv_importer - return csv_importer.apply(file_path, objects_path=objects_path, parameters={"encoding": encoding}) + return csv_importer.apply( + file_path, + objects_path=objects_path, + parameters={"encoding": encoding} + ) def read_ocel_json(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: """ Reads an object-centric event log from a JSON-OCEL file (see: http://www.ocel-standard.org/). - The ``OCEL`` object is returned by this method + Returns an `OCEL` object. - :param file_path: file path of the object-centric event log (.jsonocel) - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the object-centric event log file (`.jsonocel`). + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. code-block:: python3 @@ -261,17 +303,21 @@ def read_ocel_json(file_path: str, encoding: str = constants.DEFAULT_ENCODING) - raise Exception("File does not exist") from pm4py.objects.ocel.importer.jsonocel import importer as jsonocel_importer - return jsonocel_importer.apply(file_path, variant=jsonocel_importer.Variants.CLASSIC, parameters={"encoding": encoding}) + return jsonocel_importer.apply( + file_path, + variant=jsonocel_importer.Variants.CLASSIC, + parameters={"encoding": encoding} + ) def read_ocel_xml(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: """ - Reads an object-centric event log from a XML-OCEL file (see: http://www.ocel-standard.org/). - The ``OCEL`` object is returned by this method + Reads an object-centric event log from an XML-OCEL file (see: http://www.ocel-standard.org/). + Returns an `OCEL` object. - :param file_path: file path of the object-centric event log (.xmlocel) - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the object-centric event log file (`.xmlocel`). + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. 
code-block:: python3 @@ -283,17 +329,21 @@ def read_ocel_xml(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> raise Exception("File does not exist") from pm4py.objects.ocel.importer.xmlocel import importer as xmlocel_importer - return xmlocel_importer.apply(file_path, variant=xmlocel_importer.Variants.CLASSIC, parameters={"encoding": encoding}) + return xmlocel_importer.apply( + file_path, + variant=xmlocel_importer.Variants.CLASSIC, + parameters={"encoding": encoding} + ) def read_ocel_sqlite(file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: """ Reads an object-centric event log from a SQLite database (see: http://www.ocel-standard.org/). - The ``OCEL`` object is returned by this method + Returns an `OCEL` object. - :param file_path: file path of the SQLite database (.sqlite) - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the SQLite database file (`.sqlite`). + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. code-block:: python3 @@ -305,17 +355,30 @@ def read_ocel_sqlite(file_path: str, encoding: str = constants.DEFAULT_ENCODING) raise Exception("File does not exist") from pm4py.objects.ocel.importer.sqlite import importer as sqlite_importer - return sqlite_importer.apply(file_path, variant=sqlite_importer.Variants.PANDAS_IMPORTER, parameters={"encoding": encoding}) - - -def read_ocel2(file_path: str, variant_str: Optional[str] = None, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: + return sqlite_importer.apply( + file_path, + variant=sqlite_importer.Variants.PANDAS_IMPORTER, + parameters={"encoding": encoding} + ) + + +def read_ocel2( + file_path: str, + variant_str: Optional[str] = None, + encoding: str = constants.DEFAULT_ENCODING +) -> OCEL: """ - Reads an OCEL2.0 event log + Reads an OCEL 2.0 event log. + + :param file_path: Path to the OCEL 2.0 event log file. + :param variant_str: [Optional] Specification of the importer variant to be used. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` - :param file_path: path to the OCEL2.0 event log - :param variant_str: (optional) specification of the importer variant to be used - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + Supported file formats based on extension: + - `.sqlite` – SQLite database, + - `.xml` or `.xmlocel` – XML file, + - `.json` or `.jsonocel` – JSON file. .. code-block:: python3 @@ -334,14 +397,18 @@ def read_ocel2(file_path: str, variant_str: Optional[str] = None, encoding: str return read_ocel2_json(file_path, variant_str=variant_str, encoding=encoding) -def read_ocel2_json(file_path: str, variant_str: Optional[str] = None, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: +def read_ocel2_json( + file_path: str, + variant_str: Optional[str] = None, + encoding: str = constants.DEFAULT_ENCODING +) -> OCEL: """ - Reads an OCEL2.0 event log from a JSON-OCEL(2) file + Reads an OCEL 2.0 event log from a JSON-OCEL2 file. - :param file_path: path to the JSON file - :param variant_str: (optional) specification of the importer variant to be used - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the JSON file (`.jsonocel`). + :param variant_str: [Optional] Specification of the importer variant to be used. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. 
code-block:: python3 @@ -357,17 +424,25 @@ def read_ocel2_json(file_path: str, variant_str: Optional[str] = None, encoding: if variant_str == "ocel20_rustxes": variant = jsonocel_importer.Variants.OCEL20_RUSTXES - return jsonocel_importer.apply(file_path, variant=variant, parameters={"encoding": encoding}) + return jsonocel_importer.apply( + file_path, + variant=variant, + parameters={"encoding": encoding} + ) -def read_ocel2_sqlite(file_path: str, variant_str: Optional[str] = None, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: +def read_ocel2_sqlite( + file_path: str, + variant_str: Optional[str] = None, + encoding: str = constants.DEFAULT_ENCODING +) -> OCEL: """ - Reads an OCEL2.0 event log from a SQLite database + Reads an OCEL 2.0 event log from a SQLite database. - :param file_path: path to the OCEL2.0 database - :param variant_str: (optional) specification of the importer variant to be used - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the OCEL 2.0 SQLite database file (`.sqlite`). + :param variant_str: [Optional] Specification of the importer variant to be used. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. code-block:: python3 @@ -379,17 +454,25 @@ def read_ocel2_sqlite(file_path: str, variant_str: Optional[str] = None, encodin raise Exception("File does not exist") from pm4py.objects.ocel.importer.sqlite import importer as sqlite_importer - return sqlite_importer.apply(file_path, variant=sqlite_importer.Variants.OCEL20, parameters={"encoding": encoding}) - - -def read_ocel2_xml(file_path: str, variant_str: Optional[str] = None, encoding: str = constants.DEFAULT_ENCODING) -> OCEL: + return sqlite_importer.apply( + file_path, + variant=sqlite_importer.Variants.OCEL20, + parameters={"encoding": encoding} + ) + + +def read_ocel2_xml( + file_path: str, + variant_str: Optional[str] = None, + encoding: str = constants.DEFAULT_ENCODING +) -> OCEL: """ - Reads an OCEL2.0 event log from an XML file + Reads an OCEL 2.0 event log from an XML file. - :param file_path: path to the OCEL2.0 event log - :param variant_str: (optional) specification of the importer variant to be used - :param encoding: the encoding to be used (default: utf-8) - :rtype: ``OCEL`` + :param file_path: Path to the OCEL 2.0 XML file (`.xmlocel`). + :param variant_str: [Optional] Specification of the importer variant to be used. + :param encoding: Encoding to be used (default: `utf-8`). + :rtype: `OCEL` .. code-block:: python3 @@ -405,4 +488,8 @@ def read_ocel2_xml(file_path: str, variant_str: Optional[str] = None, encoding: if variant_str == "ocel20_rustxes": variant = xml_importer.Variants.OCEL20_RUSTXES - return xml_importer.apply(file_path, variant=variant, parameters={"encoding": encoding}) + return xml_importer.apply( + file_path, + variant=variant, + parameters={"encoding": encoding} + ) diff --git a/pm4py/sim.py b/pm4py/sim.py index 9aadc6121..fa9d31cd1 100644 --- a/pm4py/sim.py +++ b/pm4py/sim.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.sim`` module contains the simulation algorithms offered in ``pm4py`` +The ``pm4py.sim`` module contains simulation algorithms provided by ``pm4py``. """ from collections import Counter @@ -33,15 +33,21 @@ def play_out(*args: Union[Tuple[PetriNet, Marking, Marking], dict, Counter, ProcessTree], **kwargs) -> EventLog: """ - Performs the playout of the provided model, - i.e., gets a set of traces from the model. 
- The function either takes a petri net, initial and final marking, or, a process tree as an input. - - :param args: model (Petri net with initial and final marking, or process tree) - :param kwargs: optional parameters of the method, including: - - parameters: dictionary containing the parameters of the playout, including: - - smap: (if provided) stochastic map to be used to stochastically choose the transition - - log: (if provided) EventLog to be used to compute the stochastic map, if smap not provided + Performs the playout of the provided model, generating a set of traces. + + The function accepts one of the following inputs: + - A Petri net with initial and final markings. + - A Directly-Follows Graph (DFG) represented as a dictionary. + - A process tree. + + :param args: + - For Petri net playout: a `PetriNet`, an initial `Marking`, and a final `Marking`. + - For DFG playout: a `dict` representing the DFG, followed by additional required arguments. + - For process tree playout: a single `ProcessTree`. + :param kwargs: Optional parameters of the method, including: + - `parameters`: A dictionary containing parameters of the playout, such as: + - `smap`: (optional) A stochastic map to be used for probabilistic transition selection. + - `log`: (optional) An `EventLog` used to compute the stochastic map if `smap` is not provided. :rtype: ``EventLog`` .. code-block:: python3 @@ -85,20 +91,20 @@ def play_out(*args: Union[Tuple[PetriNet, Marking, Marking], dict, Counter, Proc return dfg_playout.apply(args[0], args[1], args[2], **kwargs) elif len(args) == 1: from pm4py.objects.process_tree.obj import ProcessTree - if type(args[0]) is ProcessTree: + if isinstance(args[0], ProcessTree): from pm4py.algo.simulation.playout.process_tree import algorithm return algorithm.apply(args[0], **kwargs) - raise Exception("unsupported model for playout") + raise Exception("Unsupported model for playout") def generate_process_tree(**kwargs) -> ProcessTree: """ - Generates a process tree + Generates a process tree. Reference paper: PTandLogGenerator: A Generator for Artificial Event Data - :param kwargs: dictionary containing the parameters of the process tree generator algorithm + :param kwargs: Parameters for the process tree generator algorithm. :rtype: ``ProcessTree`` .. code-block:: python3 diff --git a/pm4py/stats.py b/pm4py/stats.py index c44407515..0df5b7424 100644 --- a/pm4py/stats.py +++ b/pm4py/stats.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.stats`` module contains the statistics offered in ``pm4py`` +The ``pm4py.stats`` module contains the statistical functionalities offered in ``pm4py``. 
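Referring back to ``play_out`` above, here is a minimal usage sketch of the three accepted input forms. It is a sketch, not part of the patch: it assumes the standard pm4py discovery helpers (``pm4py.discover_petri_net_inductive``, ``pm4py.discover_dfg``, ``pm4py.discover_process_tree_inductive``) and that, for DFG playout, the three positional arguments are the DFG dictionary followed by the start- and end-activity dictionaries, as suggested by the call to ``dfg_playout.apply(args[0], args[1], args[2], **kwargs)`` shown above.

.. code-block:: python3

    import pm4py

    log = pm4py.read_xes('tests/input_data/running-example.xes')

    # Playout of a Petri net: pass the net with its initial and final marking
    net, im, fm = pm4py.discover_petri_net_inductive(log)
    simulated_log = pm4py.play_out(net, im, fm)

    # Playout of a DFG: pass the DFG dict, the start activities and the end activities
    dfg, start_activities, end_activities = pm4py.discover_dfg(log)
    simulated_dfg_log = pm4py.play_out(dfg, start_activities, end_activities)

    # Playout of a process tree: pass the tree as the only argument
    tree = pm4py.discover_process_tree_inductive(log)
    simulated_tree_log = pm4py.play_out(tree)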
""" import sys @@ -32,7 +32,11 @@ import pandas as pd from pm4py.objects.log.obj import EventLog, Trace, EventStream -from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns, insert_ev_in_tr_index +from pm4py.util.pandas_utils import ( + check_is_pandas_dataframe, + check_pandas_dataframe_columns, + insert_ev_in_tr_index, +) from pm4py.utils import get_properties, __event_log_deprecation_warning from pm4py.util import constants, pandas_utils from pm4py.objects.petri_net.obj import PetriNet @@ -40,70 +44,114 @@ import deprecation -def get_start_activities(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, int]: +def get_start_activities( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[str, int]: """ - Returns the start activities from a log object + Returns the start activities and their frequencies from a log object. - :param log: Log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, int]`` + :param log: Log object (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping start activity names to their frequencies. .. code-block:: python3 import pm4py - start_activities = pm4py.get_start_activities(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + start_activities = pm4py.get_start_activities( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.start_activities.pandas import get + return get.get_start_activities(log, parameters=properties) else: from pm4py.statistics.start_activities.log import get + return get.get_start_activities(log, parameters=properties) -def get_end_activities(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, int]: +def get_end_activities( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[str, int]: """ - Returns the end activities of a log + Returns the end activities and their frequencies from a log object. 
- :param log: Log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, int]`` + :param log: Log object (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping end activity names to their frequencies. .. code-block:: python3 import pm4py - end_activities = pm4py.get_end_activities(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + end_activities = pm4py.get_end_activities( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.end_activities.pandas import get + return get.get_end_activities(log, parameters=properties) else: from pm4py.statistics.end_activities.log import get + return get.get_end_activities(log, parameters=properties) def get_event_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]: """ - Returns the attributes at the event level of the log + Returns the list of event-level attributes in the log. - :param log: Log object - :rtype: ``List[str]`` + :param log: Log object (EventLog or pandas DataFrame). + :return: A list of event attribute names. .. code-block:: python3 @@ -118,15 +166,16 @@ def get_event_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]: return list(log.columns) else: from pm4py.statistics.attributes.log import get + return list(get.get_all_event_attributes_from_log(log)) def get_trace_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]: """ - Gets the attributes at the trace level of a log object + Returns the list of trace-level attributes in the log. - :param log: Log object - :rtype: ``List[str]`` + :param log: Log object (EventLog or pandas DataFrame). + :return: A list of trace attribute names. .. 
code-block:: python3 @@ -139,75 +188,109 @@ def get_trace_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]: from pm4py.util import constants if check_is_pandas_dataframe(log): check_pandas_dataframe_columns(log) - return [x for x in list(log.columns) if x.startswith(constants.CASE_ATTRIBUTE_PREFIX)] + return [ + x + for x in list(log.columns) + if x.startswith(constants.CASE_ATTRIBUTE_PREFIX) + ] else: from pm4py.statistics.attributes.log import get + return list(get.get_all_trace_attributes_from_log(log)) -def get_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str, count_once_per_case=False, case_id_key: str = "case:concept:name") -> Dict[str, int]: +def get_event_attribute_values( + log: Union[EventLog, pd.DataFrame], + attribute: str, + count_once_per_case: bool = False, + case_id_key: str = "case:concept:name", +) -> Dict[str, int]: """ - Returns the values for a specified (event) attribute + Returns the values and their frequencies for a specified event attribute. - :param log: Log object - :param attribute: attribute - :param count_once_per_case: If True, consider only an occurrence of the given attribute value inside a case (if there are multiple events sharing the same attribute value, count only 1 occurrence) - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, int]`` + :param log: Log object (EventLog or pandas DataFrame). + :param attribute: The event attribute to analyze. + :param count_once_per_case: If True, count each attribute value at most once per case. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping attribute values to their frequencies. .. code-block:: python3 import pm4py - activities = pm4py.get_event_attribute_values(dataframe, 'concept:name', case_id_key='case:concept:name') + activities = pm4py.get_event_attribute_values( + dataframe, + 'concept:name', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) - parameters = get_properties(log, case_id_key=case_id_key) + parameters = get_properties( + log, case_id_key=case_id_key + ) parameters["keep_once_per_case"] = count_once_per_case if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, case_id_key=case_id_key + ) from pm4py.statistics.attributes.pandas import get + return get.get_attribute_values(log, attribute, parameters=parameters) else: from pm4py.statistics.attributes.log import get + return get.get_attribute_values(log, attribute, parameters=parameters) -def get_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str, case_id_key: str = "case:concept:name") -> Dict[str, int]: +def get_trace_attribute_values( + log: Union[EventLog, pd.DataFrame], + attribute: str, + case_id_key: str = "case:concept:name", +) -> Dict[str, int]: """ - Returns the values for a specified trace attribute + Returns the values and their frequencies for a specified trace attribute. - :param log: Log object - :param attribute: Attribute - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, int]`` + :param log: Log object (EventLog or pandas DataFrame). + :param attribute: The trace attribute to analyze. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping trace attribute values to their frequencies. .. 
code-block:: python3 import pm4py - tr_attr_values = pm4py.get_trace_attribute_values(dataframe, 'case:attribute', case_id_key='case:concept:name') + tr_attr_values = pm4py.get_trace_attribute_values( + dataframe, + 'case:attribute', + case_id_key='case:concept:name' + ) """ __event_log_deprecation_warning(log) - parameters = get_properties(log, case_id_key=case_id_key) + parameters = get_properties( + log, case_id_key=case_id_key + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, case_id_key=case_id_key + ) from pm4py.statistics.attributes.pandas import get + if attribute not in log and constants.CASE_ATTRIBUTE_PREFIX + attribute in log: - # if "attribute" does not exist as column, but "case:attribute" exists, then use that + # If "attribute" does not exist as a column, but "case:attribute" exists, then use that. attribute = constants.CASE_ATTRIBUTE_PREFIX + attribute ret = get.get_attribute_values(log, attribute, parameters=parameters) return ret else: from pm4py.statistics.attributes.log import get + ret = get.get_trace_attribute_values(log, attribute, parameters=parameters) if not ret: - # if the provided attribute does not exist, but starts with "case:", try to get the attribute values - # removing the "case:" at the beginning + # If the provided attribute does not exist, but starts with "case:", try to get the attribute values + # by removing the "case:" prefix. if attribute.startswith(constants.CASE_ATTRIBUTE_PREFIX): attribute = attribute.split(constants.CASE_ATTRIBUTE_PREFIX)[-1] ret = get.get_trace_attribute_values(log, attribute, parameters=parameters) @@ -215,80 +298,126 @@ def get_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: st return ret -def get_variants(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", max_repetitions: int = sys.maxsize) -> Union[Dict[Tuple[str], List[Trace]], Dict[Tuple[str], int]]: +def get_variants( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + max_repetitions: int = sys.maxsize, +) -> Union[Dict[Tuple[str], List[Trace]], Dict[Tuple[str], int]]: """ - Gets the variants from the log + Retrieves the variants from the log. - :param log: Event log - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param max_repetitions: maximum number of consecutive repetitions for an activity. - For example, {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'B', 'C'): 2, ('A', 'B', 'B', 'B', 'B', 'B', 'C'): 1} would be reduced to: {('A', 'B', 'C'): 6} if max_repetitions=1; {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'C'): 3} if max_repetitions=2; {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'B', 'C'): 3} if max_repetitions=3; {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'B', 'C'): 2, ('A', 'B', 'B', 'B', 'B', 'C'): 1} if max_repetitions=4; - :rtype: ``Dict[Tuple[str], List[Trace]]`` + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :param max_repetitions: Maximum number of consecutive repetitions for an activity. 
+ Reduces variants by limiting consecutive activity repetitions. + :return: A dictionary mapping activity tuples to their counts or lists of traces. .. code-block:: python3 import pm4py - variants = pm4py.get_variants(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + variants = pm4py.get_variants( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ - return get_variants_as_tuples(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, max_repetitions=max_repetitions) - - -def get_variants_as_tuples(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", max_repetitions: int = sys.maxsize) -> Union[Dict[Tuple[str], List[Trace]], Dict[Tuple[str], int]]: + return get_variants_as_tuples( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + max_repetitions=max_repetitions, + ) + + +def get_variants_as_tuples( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + max_repetitions: int = sys.maxsize, +) -> Union[Dict[Tuple[str], List[Trace]], Dict[Tuple[str], int]]: """ - Gets the variants from the log (where the keys are tuples and not strings) + Retrieves the variants from the log, where the variant keys are tuples. - :param log: Event log - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param max_repetitions: maximum number of consecutive repetitions for an activity. - For example, {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'B', 'C'): 2, ('A', 'B', 'B', 'B', 'B', 'B', 'C'): 1} would be reduced to: {('A', 'B', 'C'): 6} if max_repetitions=1; {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'C'): 3} if max_repetitions=2; {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'B', 'C'): 3} if max_repetitions=3; {('A', 'B', 'C'): 3, ('A', 'B', 'B', 'B', 'C'): 2, ('A', 'B', 'B', 'B', 'B', 'C'): 1} if max_repetitions=4; - :rtype: ``Dict[Tuple[str], List[Trace]]`` + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :param max_repetitions: Maximum number of consecutive repetitions for an activity. + Reduces variants by limiting consecutive activity repetitions. + :return: A dictionary mapping activity tuples to their counts or lists of traces. .. 
code-block:: python3 import pm4py - variants = pm4py.get_variants_as_tuples(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + variants = pm4py.get_variants_as_tuples( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.variants.pandas import get + variants = get.get_variants_count(log, parameters=properties) else: from pm4py.statistics.variants.log import get + variants = get.get_variants(log, parameters=properties) if max_repetitions < sys.maxsize: from pm4py.util import variants_util - variants = variants_util.aggregate_consecutive_activities_in_variants(variants, max_repetitions=max_repetitions) + + variants = variants_util.aggregate_consecutive_activities_in_variants( + variants, max_repetitions=max_repetitions + ) return variants -def split_by_process_variant(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", - timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", - variant_column: str = "@@variant_column", - index_in_trace_column: str = "@@index_in_trace") -> Iterator[ - Tuple[Collection[str], pd.DataFrame]]: +def split_by_process_variant( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + variant_column: str = "@@variant_column", + index_in_trace_column: str = "@@index_in_trace", +) -> Iterator[Tuple[Collection[str], pd.DataFrame]]: """ Splits an event log into sub-dataframes for each process variant. - The result is an iterator over the variants along with the sub-dataframes. + The result is an iterator over the variants along with their corresponding sub-dataframes. - :param log: Event log - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param variant_column: name of the utility column that stores the variant's tuple - :param index_in_trace_column: name of the utility column that stores the index of the event in the case - :rtype: ``Iterator[Tuple[Collection[str], pd.DataFrame]]`` + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :param variant_column: Name of the utility column that stores the variant's tuple. + :param index_in_trace_column: Name of the utility column that stores the index of the event in the case. + :return: An iterator of tuples, each containing a variant and its corresponding sub-dataframe. .. 
code-block:: python3 @@ -304,14 +433,29 @@ def split_by_process_variant(log: Union[EventLog, pd.DataFrame], activity_key: s __event_log_deprecation_warning(log) import pm4py + log = pm4py.convert_to_dataframe(log) - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.util import pandas_utils - log = pandas_utils.insert_ev_in_tr_index(log, case_id=case_id_key, column_name=index_in_trace_column) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + + log = pandas_utils.insert_ev_in_tr_index( + log, case_id=case_id_key, column_name=index_in_trace_column + ) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.objects.log.util import pandas_numpy_variants + variants_dict, case_variant = pandas_numpy_variants.apply(log, parameters=properties) log[variant_column] = log[case_id_key].map(case_variant) @@ -320,34 +464,37 @@ def split_by_process_variant(log: Union[EventLog, pd.DataFrame], activity_key: s yield variant, filtered_log -def get_variants_paths_duration(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", - timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", - variant_column: str = "@@variant_column", - variant_count: str = "@@variant_count", - index_in_trace_column: str = "@@index_in_trace", - cumulative_occ_path_column: str = "@@cumulative_occ_path_column", - times_agg: str = "mean") -> pd.DataFrame: +def get_variants_paths_duration( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", + variant_column: str = "@@variant_column", + variant_count: str = "@@variant_count", + index_in_trace_column: str = "@@index_in_trace", + cumulative_occ_path_column: str = "@@cumulative_occ_path_column", + times_agg: str = "mean", +) -> pd.DataFrame: """ - Method that associates to a log object a Pandas dataframe aggregated by variants and positions (inside the variant). - Each row is associated to different columns: + Builds, for a log object, a pandas DataFrame aggregated by variant and by position within each variant. 
+ Each row includes: - The variant - - The position (in the variant) - - The source activity (of the path) - - The target activity (of the path) - - An aggregation of the times between the two activities (for example, the mean over all the cases of the same variant) - - The cumulative occurrences of the path inside the case (for example, the first A->B would be associated to 0, - and the second A->B would be associated to 1) - - :param log: Event log - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :param variant_column: name of the utility column that stores the variant's tuple - :param variant_count: name of the utility column that stores the variant's number of occurrences - :param index_in_trace_column: name of the utility column that stores the index of the event in the case - :param cumulative_occ_path_column: name of the column that stores the cumulative occurrences of the path inside the case - :param times_agg: aggregation (mean, median) to be used - :rtype: ``pd.DataFrame`` + - The position within the variant + - The source activity of the path + - The target activity of the path + - An aggregation of the times between the two activities (e.g., mean) + - The cumulative occurrences of the path within the case + + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :param variant_column: Name of the utility column that stores the variant's tuple. + :param variant_count: Name of the utility column that stores the variant's occurrence count. + :param index_in_trace_column: Name of the utility column that stores the index of the event in the case. + :param cumulative_occ_path_column: Name of the column that stores the cumulative occurrences of the path within the case. + :param times_agg: Aggregation function to be used for time differences (e.g., "mean", "median"). + :return: A pandas DataFrame with the aggregated variant paths and durations. .. 
code-block:: python3 @@ -361,379 +508,665 @@ def get_variants_paths_duration(log: Union[EventLog, pd.DataFrame], activity_key print(var_paths_durs) """ __event_log_deprecation_warning(log) - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) list_to_concat = [] - for variant, filtered_log in split_by_process_variant(log, activity_key=activity_key, timestamp_key=timestamp_key, - case_id_key=case_id_key, variant_column=variant_column, - index_in_trace_column=index_in_trace_column): + for variant, filtered_log in split_by_process_variant( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + variant_column=variant_column, + index_in_trace_column=index_in_trace_column, + ): from pm4py.statistics.eventually_follows.pandas import get as eventually_follows - dir_follo_dataframe = eventually_follows.get_partial_order_dataframe(filtered_log.copy(), activity_key=activity_key, - timestamp_key=timestamp_key, - case_id_glue=case_id_key, - sort_caseid_required=False, - sort_timestamp_along_case_id=False, - reduce_dataframe=False) + + dir_follo_dataframe = eventually_follows.get_partial_order_dataframe( + filtered_log.copy(), + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_glue=case_id_key, + sort_caseid_required=False, + sort_timestamp_along_case_id=False, + reduce_dataframe=False, + ) dir_follo_dataframe[cumulative_occ_path_column] = dir_follo_dataframe.groupby( - [case_id_key, activity_key, activity_key + "_2"]).cumcount() + [case_id_key, activity_key, activity_key + "_2"] + ).cumcount() dir_follo_dataframe = dir_follo_dataframe[ - [index_in_trace_column, constants.DEFAULT_FLOW_TIME, cumulative_occ_path_column]].groupby( - index_in_trace_column).agg( - {constants.DEFAULT_FLOW_TIME: times_agg, cumulative_occ_path_column: 'min'}).reset_index() - dir_follo_dataframe[activity_key] = dir_follo_dataframe[index_in_trace_column].apply(lambda x: variant[x]) + [index_in_trace_column, constants.DEFAULT_FLOW_TIME, cumulative_occ_path_column] + ].groupby(index_in_trace_column).agg( + {constants.DEFAULT_FLOW_TIME: times_agg, cumulative_occ_path_column: "min"} + ).reset_index() + dir_follo_dataframe[activity_key] = dir_follo_dataframe[index_in_trace_column].apply( + lambda x: variant[x] + ) dir_follo_dataframe[activity_key + "_2"] = dir_follo_dataframe[index_in_trace_column].apply( - lambda x: variant[x + 1]) - dir_follo_dataframe[variant_column] = dir_follo_dataframe[index_in_trace_column].apply(lambda x: variant) + lambda x: variant[x + 1] + ) + dir_follo_dataframe[variant_column] = dir_follo_dataframe[index_in_trace_column].apply( + lambda x: variant + ) dir_follo_dataframe[variant_count] = filtered_log[case_id_key].nunique() list_to_concat.append(dir_follo_dataframe) dataframe = pandas_utils.concat(list_to_concat) dataframe[index_in_trace_column] = -dataframe[index_in_trace_column] - dataframe = dataframe.sort_values([variant_count, variant_column, index_in_trace_column], ascending=False) + dataframe = dataframe.sort_values( + [variant_count, variant_column, index_in_trace_column], ascending=False + ) dataframe[index_in_trace_column] = -dataframe[index_in_trace_column] return dataframe + def get_stochastic_language(*args, **kwargs) -> Dict[List[str], float]: """ - Gets the stochastic language from the provided object + Retrieves the stochastic language 
from the provided object. + + The stochastic language represents the probabilities of different traces or sequences within the process. - :param args: Pandas dataframe / event log / accepting Petri net / process tree - :param kwargs: keyword arguments - :rtype: ``Dict[List[str], float]`` + :param args: The input object, which can be a pandas DataFrame, EventLog, accepting Petri net, or ProcessTree. + :param kwargs: Additional keyword arguments. + :return: A dictionary mapping sequences of activities to their probabilities. .. code-block:: python3 import pm4py + # From an event log log = pm4py.read_xes('tests/input_data/running-example.xes') language_log = pm4py.get_stochastic_language(log) print(language_log) + + # From a Petri net net, im, fm = pm4py.read_pnml('tests/input_data/running-example.pnml') language_model = pm4py.get_stochastic_language(net, im, fm) print(language_model) """ from pm4py.statistics.variants.log import get + if isinstance(args[0], EventLog) or isinstance(args[0], EventStream) or pandas_utils.check_is_pandas_dataframe(args[0]): from pm4py.objects.conversion.log import converter as log_converter + log = log_converter.apply(args[0]) return get.get_language(log) elif isinstance(args[0], PetriNet) or isinstance(args[0], ProcessTree) or isinstance(args[0], dict): import pm4py + log = pm4py.play_out(*args, **kwargs) return get.get_language(log) else: - raise Exception("unsupported input") + raise Exception("Unsupported input type for stochastic language extraction.") -def get_minimum_self_distances(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, int]: - ''' - This algorithm computes the minimum self-distance for each activity observed in an event log. - The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc. - The minimum self distance is the minimal observed self distance value in the event log. +def get_minimum_self_distances( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[str, int]: + """ + Computes the minimum self-distance for each activity observed in an event log. - :param log: event log (either pandas.DataFrame, EventLog or EventStream) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, int]`` + The self-distance of an activity `a` in a trace is defined as follows: + - In a trace <a>, it's infinity. + - In a trace <a,a>, it's 0. + - In a trace <a,b,a>, it's 1. + - And so on. + + The minimum self-distance for an activity is the smallest self-distance observed across all traces. + + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping each activity to its minimum self-distance. .. 
code-block:: python3 import pm4py - msd = pm4py.get_minimum_self_distances(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') - ''' + msd = pm4py.get_minimum_self_distances( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) + """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) + + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.algo.discovery.minimum_self_distance import algorithm as msd_algo + return msd_algo.apply(log, parameters=properties) -def get_minimum_self_distance_witnesses(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, Set[str]]: +def get_minimum_self_distance_witnesses( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[str, Set[str]]: """ - This function derives the minimum self distance witnesses. - The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc. - The minimum self distance is the minimal observed self distance value in the event log. - A 'witness' is an activity that witnesses the minimum self distance. - For example, if the minimum self distance of activity a in some log L is 2, then, - if trace <a,b,c,a> is in log L, b and c are a witness of a. + Derives the minimum self-distance witnesses for each activity. - :param log: Event Log to use - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, Set[str]]`` + A 'witness' is an activity that occurs between two occurrences of the same activity at the minimum self-distance. + For example, if the minimum self-distance of activity `a` is 2, then in a trace <a,b,c,a>, + activities `b` and `c` are witnesses of `a`. + + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping each activity to a set of its witness activities. .. 
code-block:: python3 import pm4py - msd_wit = pm4py.get_minimum_self_distance_witnesses(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + msd_wit = pm4py.get_minimum_self_distance_witnesses( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.algo.discovery.minimum_self_distance import algorithm as msd_algo from pm4py.algo.discovery.minimum_self_distance import utils as msdw_algo - return msdw_algo.derive_msd_witnesses(log, msd_algo.apply(log, parameters=get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key))) - -def get_case_arrival_average(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> float: + return msdw_algo.derive_msd_witnesses( + log, + msd_algo.apply( + log, + parameters=get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ), + ), + ) + + +def get_case_arrival_average( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> float: """ - Gets the average difference between the start times of two consecutive cases + Calculates the average time difference between the start times of two consecutive cases. - :param log: log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``float`` + + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: The average time between the start timestamps of two consecutive cases. .. 
code-block:: python3 import pm4py - case_arr_avg = pm4py.get_case_arrival_average(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + case_arr_avg = pm4py.get_case_arrival_average( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.traces.generic.pandas import case_arrival + return case_arrival.get_case_arrival_avg(log, parameters=properties) else: from pm4py.statistics.traces.generic.log import case_arrival + return case_arrival.get_case_arrival_avg(log, parameters=properties) -def get_rework_cases_per_activity(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, int]: +def get_rework_cases_per_activity( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[str, int]: """ - Find out for which activities of the log the rework (more than one occurrence in the trace for the activity) - occurs. - The output is a dictionary associating to each of the aforementioned activities - the number of cases for which the rework occurred. + Identifies activities that have rework occurrences, i.e., activities that occur more than once within the same case. + The output is a dictionary mapping each such activity to the number of cases in which rework occurred. - :param log: Log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, int]`` + :param log: Log object (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping each activity with rework to the number of cases where rework occurred. .. 
code-block:: python3 import pm4py - rework = pm4py.get_rework_cases_per_activity(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + rework = pm4py.get_rework_cases_per_activity( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.rework.pandas import get as rework_get + return rework_get.apply(log, parameters=properties) else: from pm4py.statistics.rework.log import get as rework_get + return rework_get.apply(log, parameters=properties) -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="the get_case_overlap function will be removed in a future release.") -def get_case_overlap(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> List[int]: +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="The get_case_overlap function will be removed in a future release.", +) +def get_case_overlap( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> List[int]: """ - Associates to each case in the log the number of cases concurrently open + Associates each case in the log with the number of cases that are concurrently open. - :param log: Log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``List[int]`` + :param log: Log object (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A list where each element corresponds to a case and indicates the number of overlapping cases. .. 
code-block:: python3 import pm4py - overlap = pm4py.get_case_overlap(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + overlap = pm4py.get_case_overlap( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.overlap.cases.pandas import get as cases_overlap + return cases_overlap.apply(log, parameters=properties) else: from pm4py.statistics.overlap.cases.log import get as cases_overlap + return cases_overlap.apply(log, parameters=properties) -def get_cycle_time(log: Union[EventLog, pd.DataFrame], activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> float: +def get_cycle_time( + log: Union[EventLog, pd.DataFrame], + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> float: """ Calculates the cycle time of the event log. - The definition that has been followed is the one proposed in: - https://www.presentationeze.com/presentations/lean-manufacturing-just-in-time/lean-manufacturing-just-in-time-full-details/process-cycle-time-analysis/calculate-cycle-time/#:~:text=Cycle%20time%20%3D%20Average%20time%20between,is%2024%20minutes%20on%20average. - - So: - Cycle time = Average time between completion of units. + Cycle time is defined as the average time between the completion of units. - Example taken from the website: - Consider a manufacturing facility, which is producing 100 units of product per 40 hour week. - The average throughput rate is 1 unit per 0.4 hours, which is one unit every 24 minutes. - Therefore the cycle time is 24 minutes on average. + Example: + In a manufacturing facility producing 100 units in a 40-hour week, + the average throughput rate is 1 unit per 0.4 hours (24 minutes per unit). + Therefore, the cycle time is 24 minutes on average. - :param log: Log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``float`` + :param log: Event log (EventLog or pandas DataFrame). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: The cycle time as a float. .. 
code-block:: python3 import pm4py - cycle_time = pm4py.get_cycle_time(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + cycle_time = pm4py.get_cycle_time( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.traces.cycle_time.pandas import get as cycle_time + return cycle_time.apply(log, parameters=properties) else: from pm4py.statistics.traces.cycle_time.log import get as cycle_time + return cycle_time.apply(log, parameters=properties) -def get_service_time(log: Union[EventLog, pd.DataFrame], aggregation_measure: str = "mean", activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", start_timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[str, float]: +def get_service_time( + log: Union[EventLog, pd.DataFrame], + aggregation_measure: str = "mean", + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + start_timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[str, float]: """ - Gets the activities' (average/median/...) service time in the provided event log + Computes the service time for each activity in the event log using the specified aggregation measure. + + Service time refers to the duration an activity takes within a case. - :param log: event log - :param aggregation_measure: the aggregation to be used (mean, median, min, max, sum) - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param start_timestamp_key: attribute to be used for the start timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[str, float]`` + :param log: Event log (EventLog or pandas DataFrame). + :param aggregation_measure: Aggregation function to apply (e.g., "mean", "median", "min", "max", "sum"). + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param start_timestamp_key: Attribute to be used for the start timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping each activity to its aggregated service time. .. 
code-block:: python3 import pm4py log = pm4py.read_xes('tests/input_data/interval_event_log.xes') - mean_serv_time = pm4py.get_service_time(log, start_timestamp_key='start_timestamp', aggregation_measure='mean') + mean_serv_time = pm4py.get_service_time( + log, + start_timestamp_key='start_timestamp', + aggregation_measure='mean' + ) print(mean_serv_time) - median_serv_time = pm4py.get_service_time(log, start_timestamp_key='start_timestamp', aggregation_measure='median') + + median_serv_time = pm4py.get_service_time( + log, + start_timestamp_key='start_timestamp', + aggregation_measure='median' + ) print(median_serv_time) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, start_timestamp_key=start_timestamp_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + start_timestamp_key=start_timestamp_key, + ) properties["aggregationMeasure"] = aggregation_measure if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key, start_timestamp_key=start_timestamp_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + start_timestamp_key=start_timestamp_key, + ) from pm4py.statistics.service_time.pandas import get as serv_time_get + return serv_time_get.apply(log, parameters=properties) else: from pm4py.statistics.service_time.log import get as serv_time_get + return serv_time_get.apply(log, parameters=properties) -def get_all_case_durations(log: Union[EventLog, pd.DataFrame], business_hours: bool = False, business_hour_slots=constants.DEFAULT_BUSINESS_HOUR_SLOTS, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> List[float]: +def get_all_case_durations( + log: Union[EventLog, pd.DataFrame], + business_hours: bool = False, + business_hour_slots=constants.DEFAULT_BUSINESS_HOUR_SLOTS, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> List[float]: """ - Gets the durations of the cases in the event log - - :param log: Event log - :param business_hours: Enables/disables the computation based on the business hours (default: False) - :param business_hour_slots: work schedule of the company, provided as a list of tuples where each tuple represents one time slot of business hours. One slot i.e. one tuple consists of one start and one end time given in seconds since week start, e.g. [(7 * 60 * 60, 17 * 60 * 60), ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60),] meaning that business hours are Mondays 07:00 - 17:00 and Tuesdays 07:00 - 12:00 and 13:00 - 17:00 - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``List[float]`` + Retrieves the durations of all cases in the event log. + + :param log: Event log (EventLog or pandas DataFrame). + :param business_hours: If True, computes durations based on business hours; otherwise, uses calendar time. + :param business_hour_slots: Work schedule of the company as a list of tuples. Each tuple represents a time slot in seconds since the week start. 
+ Example: [ + (7 * 60 * 60, 17 * 60 * 60), + ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), + ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60), + ] + This example means: + - Monday 07:00 - 17:00 + - Tuesday 07:00 - 12:00 + - Tuesday 13:00 - 17:00 + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A sorted list of case durations. .. code-block:: python3 import pm4py - case_durations = pm4py.get_all_case_durations(dataframe, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + case_durations = pm4py.get_all_case_durations( + dataframe, + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) properties["business_hours"] = business_hours properties["business_hour_slots"] = business_hour_slots + if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.traces.generic.pandas import case_statistics + cd = case_statistics.get_cases_description(log, parameters=properties) return sorted([x["caseDuration"] for x in cd.values()]) else: from pm4py.statistics.traces.generic.log import case_statistics + return case_statistics.get_all_case_durations(log, parameters=properties) -def get_case_duration(log: Union[EventLog, pd.DataFrame], case_id: str, business_hours: bool = False, business_hour_slots=constants.DEFAULT_BUSINESS_HOUR_SLOTS, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: Optional[str] = None) -> float: +def get_case_duration( + log: Union[EventLog, pd.DataFrame], + case_id: str, + business_hours: bool = False, + business_hour_slots=constants.DEFAULT_BUSINESS_HOUR_SLOTS, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: Optional[str] = None, +) -> float: """ - Gets the duration of a specific case - - :param log: Event log - :param case_id: Case identifier - :param business_hours: Enables/disables the computation based on the business hours (default: False) - :param business_hour_slots: work schedule of the company, provided as a list of tuples where each tuple represents one time slot of business hours. One slot i.e. one tuple consists of one start and one end time given in seconds since week start, e.g. [(7 * 60 * 60, 17 * 60 * 60), ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60),] meaning that business hours are Mondays 07:00 - 17:00 and Tuesdays 07:00 - 12:00 and 13:00 - 17:00 - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``float`` + Retrieves the duration of a specific case. + + :param log: Event log (EventLog or pandas DataFrame). + :param case_id: Identifier of the case whose duration is to be retrieved. 
+ :param business_hours: If True, computes duration based on business hours; otherwise, uses calendar time. + :param business_hour_slots: Work schedule of the company as a list of tuples. Each tuple represents a time slot in seconds since the week start. + Example: [ + (7 * 60 * 60, 17 * 60 * 60), + ((24 + 7) * 60 * 60, (24 + 12) * 60 * 60), + ((24 + 13) * 60 * 60, (24 + 17) * 60 * 60), + ] + This example means: + - Monday 07:00 - 17:00 + - Tuesday 07:00 - 12:00 + - Tuesday 13:00 - 17:00 + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: The duration of the specified case. .. code-block:: python3 import pm4py - duration = pm4py.get_case_duration(dataframe, 'case 1', activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + duration = pm4py.get_case_duration( + dataframe, + 'case_1', + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) - properties = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + properties = get_properties( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) properties["business_hours"] = business_hours properties["business_hour_slots"] = business_hour_slots + if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) from pm4py.statistics.traces.generic.pandas import case_statistics + cd = case_statistics.get_cases_description(log, parameters=properties) return cd[case_id]["caseDuration"] else: from pm4py.statistics.traces.generic.log import case_statistics + cd = case_statistics.get_cases_description(log, parameters=properties) return cd[case_id]["caseDuration"] -def get_frequent_trace_segments(log: Union[EventLog, pd.DataFrame], min_occ: int, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> TCounter: +def get_frequent_trace_segments( + log: Union[EventLog, pd.DataFrame], + min_occ: int, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> TCounter: """ - Get the traces (segments of activities) from an event log object. - Each trace is preceded and followed by "...", reminding that the trace/segment - can be preceded and followed by any other set of activities. + Retrieves frequent trace segments (sub-sequences of activities) from an event log. + Each trace segment is preceded and followed by "...", indicating that it can be part of a larger sequence. - :param log: event log - :param min_occ: minimum number of occurrence of a trace in order to be included - :param activity_key: the attribute to be used as activity - :param timestamp_key: the attribute to be used as timestamp - :param case_id_key: the attribute to be used as case identifier (for Pandas dataframes) - :rtype: ``TCounter`` + :param log: Event log (EventLog or pandas DataFrame). + :param min_occ: Minimum number of occurrences for a trace segment to be included. + :param activity_key: Attribute to be used for the activity. 
+ :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A Counter object mapping trace segments to their occurrence counts. .. code-block:: python3 @@ -746,12 +1179,19 @@ def get_frequent_trace_segments(log: Union[EventLog, pd.DataFrame], min_occ: int __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) import pm4py.utils from prefixspan import PrefixSpan - projection = pm4py.utils.project_on_event_attribute(log, attribute_key=activity_key, case_id_key=case_id_key) + projection = pm4py.utils.project_on_event_attribute( + log, attribute_key=activity_key, case_id_key=case_id_key + ) traces0 = PrefixSpan(projection).frequent(min_occ) traces = {} for x in traces0: @@ -768,32 +1208,51 @@ def get_frequent_trace_segments(log: Union[EventLog, pd.DataFrame], min_occ: int return traces -def get_activity_position_summary(log: Union[EventLog, pd.DataFrame], activity: str, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name") -> Dict[int, int]: +def get_activity_position_summary( + log: Union[EventLog, pd.DataFrame], + activity: str, + activity_key: str = "concept:name", + timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", +) -> Dict[int, int]: """ - Given an event log, returns a dictionary which summarize the positions - of the activities in the different cases of the event log. - E.g., if an activity happens 1000 times in the position 1 (the second event of a case), - and 500 times in the position 2 (the third event of a case), then the returned dictionary would be: - {1: 1000, 2: 500} + Summarizes the positions of a specific activity across all cases in the event log. + + For each occurrence of the activity, records its position within the trace. + For example, if 'A' occurs 1000 times in position 1 and 500 times in position 2, + the returned dictionary will be {1: 1000, 2: 500}. - :param log: Event log object / Pandas dataframe - :param activity: Activity to consider - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param case_id_key: attribute to be used as case identifier - :rtype: ``Dict[int, int]`` + :param log: Event log object (EventLog or pandas DataFrame). + :param activity: The activity to analyze. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param case_id_key: Attribute to be used as the case identifier. + :return: A dictionary mapping positions (0-based index) to the number of times the activity occurs in that position. .. code-block:: python3 import pm4py - act_pos = pm4py.get_activity_position_summary(dataframe, 'Act. A', activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp') + act_pos = pm4py.get_activity_position_summary( + dataframe, + 'Act. 
A', + activity_key='concept:name', + case_id_key='case:concept:name', + timestamp_key='time:timestamp' + ) """ __event_log_deprecation_warning(log) if check_is_pandas_dataframe(log): - check_pandas_dataframe_columns(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key) - log = insert_ev_in_tr_index(log, case_id_key, "@@index_in_trace") + check_pandas_dataframe_columns( + log, + activity_key=activity_key, + timestamp_key=timestamp_key, + case_id_key=case_id_key, + ) + log = insert_ev_in_tr_index( + log, case_id_key, "@@index_in_trace" + ) ret = log[log[activity_key] == activity]["@@index_in_trace"].value_counts().to_dict() return ret else: diff --git a/pm4py/utils.py b/pm4py/utils.py index cc8167ef3..621861e31 100644 --- a/pm4py/utils.py +++ b/pm4py/utils.py @@ -43,20 +43,21 @@ def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAME, - activity_key: str = xes_constants.DEFAULT_NAME_KEY, - timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY, - start_timestamp_key: str = xes_constants.DEFAULT_START_TIMESTAMP_KEY, - timest_format: Optional[str] = None) -> pd.DataFrame: + activity_key: str = xes_constants.DEFAULT_NAME_KEY, + timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY, + start_timestamp_key: str = xes_constants.DEFAULT_START_TIMESTAMP_KEY, + timest_format: Optional[str] = None) -> pd.DataFrame: """ - Give the appropriate format on the dataframe, for process mining purposes + Formats the dataframe appropriately for process mining purposes. - :param df: Dataframe - :param case_id: Case identifier column - :param activity_key: Activity column - :param timestamp_key: Timestamp column - :param start_timestamp_key: Start timestamp column - :param timest_format: Timestamp format that is provided to Pandas - :rtype: ``pd.DataFrame`` + :param df: Dataframe. + :param case_id: Case identifier column. + :param activity_key: Activity column. + :param timestamp_key: Timestamp column. + :param start_timestamp_key: Start timestamp column. + :param timest_format: Timestamp format provided to Pandas. + :return: A formatted pandas DataFrame. + :rtype: pd.DataFrame .. 
code-block:: python3 @@ -64,7 +65,14 @@ def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAM import pm4py dataframe = pd.read_csv('event_log.csv') - dataframe = pm4py.format_dataframe(dataframe, case_id_key='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp', start_timestamp_key='start_timestamp', timest_format='%Y-%m-%d %H:%M:%S') + dataframe = pm4py.format_dataframe( + dataframe, + case_id='case:concept:name', + activity_key='concept:name', + timestamp_key='time:timestamp', + start_timestamp_key='start_timestamp', + timest_format='%Y-%m-%d %H:%M:%S' + ) """ if timest_format is None: timest_format = constants.DEFAULT_TIMESTAMP_PARSE_FORMAT @@ -88,33 +96,39 @@ def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAM if xes_constants.DEFAULT_TIMESTAMP_KEY in df.columns: del df[xes_constants.DEFAULT_TIMESTAMP_KEY] df[xes_constants.DEFAULT_TIMESTAMP_KEY] = df[timestamp_key] - # makes sure that the timestamps column are of timestamp type + # Makes sure that the timestamps column are of timestamp type df = dataframe_utils.convert_timestamp_columns_in_df(df, timest_format=timest_format) - # drop NaN(s) in the main columns (case ID, activity, timestamp) to ensure functioning of the - # algorithms + # Drop NaN(s) in the main columns (case ID, activity, timestamp) to ensure functioning of the algorithms prev_length = len(df) - df = df.dropna(subset={constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_NAME_KEY, - xes_constants.DEFAULT_TIMESTAMP_KEY}, how="any") + df = df.dropna(subset={ + constants.CASE_CONCEPT_NAME, + xes_constants.DEFAULT_NAME_KEY, + xes_constants.DEFAULT_TIMESTAMP_KEY + }, how="any") if len(df) < prev_length: if constants.SHOW_INTERNAL_WARNINGS: warnings.warn("Some rows of the Pandas data frame have been removed because of empty case IDs, activity labels, or timestamps to ensure the correct functioning of PM4Py's algorithms.") - # make sure the case ID column is of string type + # Make sure the case ID column is of string type df[constants.CASE_CONCEPT_NAME] = df[constants.CASE_CONCEPT_NAME].astype("string") - # make sure the activity column is of string type + # Make sure the activity column is of string type df[xes_constants.DEFAULT_NAME_KEY] = df[xes_constants.DEFAULT_NAME_KEY].astype("string") - # set an index column + # Set an index column df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False) - # sorts the dataframe - df = df.sort_values([constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_TIMESTAMP_KEY, INDEX_COLUMN]) - # re-set the index column + # Sorts the dataframe + df = df.sort_values([ + constants.CASE_CONCEPT_NAME, + xes_constants.DEFAULT_TIMESTAMP_KEY, + INDEX_COLUMN + ]) + # Re-set the index column df = pandas_utils.insert_index(df, INDEX_COLUMN, copy_dataframe=False) - # sets the index column in the dataframe + # Sets the index column in the dataframe df = pandas_utils.insert_case_index(df, CASE_INDEX_COLUMN, copy_dataframe=False) - # sets the properties + # Sets the properties if not hasattr(df, 'attrs'): - # legacy (Python 3.6) support + # Legacy (Python 3.6) support df.attrs = {} if start_timestamp_key in df.columns: df[xes_constants.DEFAULT_START_TIMESTAMP_KEY] = df[start_timestamp_key] @@ -129,25 +143,33 @@ def format_dataframe(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAM def rebase(log_obj: Union[EventLog, EventStream, pd.DataFrame], case_id: str = constants.CASE_CONCEPT_NAME, - activity_key: str = xes_constants.DEFAULT_NAME_KEY, - timestamp_key: 
str = xes_constants.DEFAULT_TIMESTAMP_KEY, - start_timestamp_key: str = xes_constants.DEFAULT_START_TIMESTAMP_KEY, timest_format: Optional[str] = None) -> Union[EventLog, EventStream, pd.DataFrame]: + activity_key: str = xes_constants.DEFAULT_NAME_KEY, + timestamp_key: str = xes_constants.DEFAULT_TIMESTAMP_KEY, + start_timestamp_key: str = xes_constants.DEFAULT_START_TIMESTAMP_KEY, timest_format: Optional[str] = None) -> Union[EventLog, EventStream, pd.DataFrame]: """ - Re-base the log object, changing the case ID, activity and timestamp attributes. + Re-bases the log object by changing the case ID, activity, and timestamp attributes. - :param log_obj: Log object - :param case_id: Case identifier - :param activity_key: Activity - :param timestamp_key: Timestamp - :param start_timestamp_key: Start timestamp - :param timest_format: Timestamp format that is provided to Pandas - :rtype: ``Union[EventLog, EventStream, pd.DataFrame]`` + :param log_obj: Log object. + :param case_id: Case identifier. + :param activity_key: Activity. + :param timestamp_key: Timestamp. + :param start_timestamp_key: Start timestamp. + :param timest_format: Timestamp format provided to Pandas. + :return: A re-based log object. + :rtype: Union[EventLog, EventStream, pd.DataFrame]. .. code-block:: python3 import pm4py - rebased_dataframe = pm4py.rebase(dataframe, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp') + rebased_dataframe = pm4py.rebase( + dataframe, + case_id='case:concept:name', + activity_key='concept:name', + timestamp_key='time:timestamp', + start_timestamp_key='start_timestamp', + timest_format='%Y-%m-%d %H:%M:%S' + ) """ import pm4py @@ -157,33 +179,52 @@ def rebase(log_obj: Union[EventLog, EventStream, pd.DataFrame], case_id: str = c check_pandas_dataframe_columns(log_obj) if check_is_pandas_dataframe(log_obj): - return format_dataframe(log_obj, case_id=case_id, activity_key=activity_key, timestamp_key=timestamp_key, - start_timestamp_key=start_timestamp_key, timest_format=timest_format) + return format_dataframe( + log_obj, + case_id=case_id, + activity_key=activity_key, + timestamp_key=timestamp_key, + start_timestamp_key=start_timestamp_key, + timest_format=timest_format + ) elif isinstance(log_obj, EventLog): log_obj = pm4py.convert_to_dataframe(log_obj) - log_obj = format_dataframe(log_obj, case_id=case_id, activity_key=activity_key, timestamp_key=timestamp_key, - start_timestamp_key=start_timestamp_key, timest_format=timest_format) + log_obj = format_dataframe( + log_obj, + case_id=case_id, + activity_key=activity_key, + timestamp_key=timestamp_key, + start_timestamp_key=start_timestamp_key, + timest_format=timest_format + ) from pm4py.objects.conversion.log import converter return converter.apply(log_obj, variant=converter.Variants.TO_EVENT_LOG) elif isinstance(log_obj, EventStream): log_obj = pm4py.convert_to_dataframe(log_obj) - log_obj = format_dataframe(log_obj, case_id=case_id, activity_key=activity_key, timestamp_key=timestamp_key, - start_timestamp_key=start_timestamp_key, timest_format=timest_format) + log_obj = format_dataframe( + log_obj, + case_id=case_id, + activity_key=activity_key, + timestamp_key=timestamp_key, + start_timestamp_key=start_timestamp_key, + timest_format=timest_format + ) return pm4py.convert_to_event_stream(log_obj) def parse_process_tree(tree_string: str) -> ProcessTree: """ - Parse a process tree from a string + Parses a process tree from a string. - :param tree_string: String representing a process tree (e.g. 
'-> ( 'A', O ( 'B', 'C' ), 'D' )'). Operators are '->': sequence, '+': parallel, 'X': xor choice, '*': binary loop, 'O' or choice - :rtype: ``ProcessTree`` + :param tree_string: String representing a process tree (e.g., "-> ( 'A', O ( 'B', 'C' ), 'D' )"). Operators are '->' for sequence, '+' for parallel, 'X' for XOR choice, '*' for binary loop, and 'O' for choice. + :return: A ProcessTree object. + :rtype: ProcessTree .. code-block:: python3 import pm4py - process_tree = pm4py.parse_process_tree('-> ( 'A', O ( 'B', 'C' ), 'D' )') + process_tree = pm4py.parse_process_tree("-> ( 'A', O ( 'B', 'C' ), 'D' )") """ from pm4py.objects.process_tree.utils.generic import parse return parse(tree_string) @@ -191,26 +232,19 @@ def parse_process_tree(tree_string: str) -> ProcessTree: def parse_powl_model_string(powl_string: str) -> POWL: """ - Parse a POWL model from a string representation of the process model - (with the same format as the __repr__ and __str__ methods of the POWL model) + Parses a POWL model from a string representation of the process model + (with the same format as the __repr__ and __str__ methods of the POWL model). - :param powl_string: POWL model expressed as a string (__repr__ of the POWL model) - :rtype: ``POWL`` + :param powl_string: POWL model expressed as a string (__repr__ of the POWL model). + :return: A POWL object. + :rtype: POWL .. code-block:: python3 import pm4py - powl_model = pm4py.parse_powl_model_string('PO=(nodes={ NODE1, NODE2, NODE3 }, order={ NODE1-->NODE2 }') + powl_model = pm4py.parse_powl_model_string('PO=(nodes={ NODE1, NODE2, NODE3 }, order={ NODE1-->NODE2 })') print(powl_model) - - Parameters - ---------- - powl_string - - Returns - ------- - """ from pm4py.objects.powl import parser return parser.parse_powl_model_string(powl_string) @@ -218,10 +252,17 @@ def parse_powl_model_string(powl_string: str) -> POWL: def serialize(*args) -> Tuple[str, bytes]: """ - Serialize a PM4Py object into a bytes string - - :param args: A PM4Py object, among: - an EventLog object - a Pandas dataframe object - a (Petrinet, Marking, Marking) tuple - a ProcessTree object - a BPMN object - a DFG, including the dictionary of the directly-follows relations, the start activities and the end activities - :rtype: ``Tuple[str, bytes]`` + Serializes a PM4Py object into a bytes string. + + :param args: PM4Py object(s) to serialize. Supported types include: + - An EventLog object. + - A Pandas DataFrame object. + - A tuple consisting of (PetriNet, Marking, Marking). + - A ProcessTree object. + - A BPMN object. + - A DFG, including the dictionary of directly-follows relations, start activities, and end activities. + :return: A tuple containing the serialization type as a string and the serialized bytes. + :rtype: Tuple[str, bytes] .. 
code-block:: python3 @@ -255,16 +296,25 @@ def serialize(*args) -> Tuple[str, bytes]: return (constants.AvailableSerializations.BPMN.value, bpmn_exporter.serialize(*args)) elif len(args) == 3 and (isinstance(args[0], dict) or isinstance(args[0], Counter)): from pm4py.objects.dfg.exporter import exporter as dfg_exporter - return (constants.AvailableSerializations.DFG.value, - dfg_exporter.serialize(args[0], parameters={"start_activities": args[1], "end_activities": args[2]})) + return ( + constants.AvailableSerializations.DFG.value, + dfg_exporter.serialize( + args[0], + parameters={ + "start_activities": args[1], + "end_activities": args[2] + } + ) + ) def deserialize(ser_obj: Tuple[str, bytes]) -> Any: """ - Deserialize a bytes string to a PM4Py object + Deserializes a bytes string back into a PM4Py object. - :param ser_obj: Serialized object (a tuple consisting of a string denoting the type of the object, and a bytes string representing the serialization) - :rtype: ``Any`` + :param ser_obj: Serialized object as a tuple, consisting of a string indicating the type of the object and a bytes string representing the serialization. + :return: The deserialized PM4Py object. + :rtype: Any .. code-block:: python3 @@ -297,24 +347,28 @@ def deserialize(ser_obj: Tuple[str, bytes]) -> Any: return dfg_importer.deserialize(ser_obj[1]) -def get_properties(log, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", case_id_key: str = "case:concept:name", resource_key: str = "org:resource", group_key: Optional[str] = None, start_timestamp_key: Optional[str] = None, **kwargs): +def get_properties(log, activity_key: str = "concept:name", timestamp_key: str = "time:timestamp", + case_id_key: str = "case:concept:name", resource_key: str = "org:resource", + group_key: Optional[str] = None, start_timestamp_key: Optional[str] = None, + **kwargs): """ - Gets the properties from a log object - - :param log: Log object - :param activity_key: attribute to be used for the activity - :param timestamp_key: attribute to be used for the timestamp - :param start_timestamp_key: (optional) attribute to be used for the start timestamp - :param case_id_key: attribute to be used as case identifier - :param resource_key: (if provided) attribute to be used as resource - :param group_key: (if provided) attribute to be used as group identifier - :rtype: ``Dict`` + Retrieves the properties from a log object. + + :param log: Log object. + :param activity_key: Attribute to be used for the activity. + :param timestamp_key: Attribute to be used for the timestamp. + :param start_timestamp_key: (Optional) Attribute to be used for the start timestamp. + :param case_id_key: Attribute to be used as case identifier. + :param resource_key: (Optional) Attribute to be used as resource. + :param group_key: (Optional) Attribute to be used as group identifier. + :param kwargs: Additional keyword arguments. + :return: A dictionary of properties. 
+ :rtype: Dict """ __event_log_deprecation_warning(log) from copy import copy - parameters = copy(log.properties) if hasattr(log, 'properties') else copy(log.attrs) if hasattr(log, - 'attrs') else {} + parameters = copy(log.properties) if hasattr(log, 'properties') else copy(log.attrs) if hasattr(log, 'attrs') else {} if activity_key is not None: parameters[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key @@ -341,16 +395,23 @@ def get_properties(log, activity_key: str = "concept:name", timestamp_key: str = return parameters -@deprecation.deprecated(deprecated_in="2.3.0", removed_in="3.0.0", details="this method will be removed in a future release." - "Please use the method-specific arguments.") +@deprecation.deprecated( + deprecated_in="2.3.0", + removed_in="3.0.0", + details="This method will be removed in a future release. Please use the method-specific arguments." +) def set_classifier(log, classifier, classifier_attribute=constants.DEFAULT_CLASSIFIER_ATTRIBUTE): """ - Methods to set the specified classifier on an existing event log - - :param log: Log object - :param classifier: Classifier that should be set: - A list of event attributes can be provided - A single event attribute can be provided - A classifier stored between the "classifiers" of the log object can be provided - :param classifier_attribute: The attribute of the event that should store the concatenation of the attribute values for the given classifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + Sets the specified classifier on an existing event log. + + :param log: Log object. + :param classifier: Classifier to set. It can be: + - A list of event attributes. + - A single event attribute. + - A classifier stored in the "classifiers" of the log object. + :param classifier_attribute: The attribute of the event that will store the concatenation of the attribute values for the given classifier. + :return: The updated log object as an EventLog or Pandas DataFrame. 
+ :rtype: Union[EventLog, pd.DataFrame] """ __event_log_deprecation_warning(log) @@ -365,7 +426,7 @@ def set_classifier(log, classifier, classifier_attribute=constants.DEFAULT_CLASS if type(log) is EventLog: for trace in log: for event in trace: - event[classifier_attribute] = "+".join(list(event[x] for x in classifier)) + event[classifier_attribute] = "+".join([str(event[x]) for x in classifier]) log.properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = classifier_attribute log.properties[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = classifier_attribute elif pandas_utils.check_is_pandas_dataframe(log): @@ -375,7 +436,7 @@ def set_classifier(log, classifier, classifier_attribute=constants.DEFAULT_CLASS log.attrs[constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = classifier_attribute log.attrs[constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = classifier_attribute else: - raise Exception("setting classifier is not defined for this class of objects") + raise Exception("Setting classifier is not defined for this class of objects") return log @@ -386,17 +447,16 @@ def parse_event_log_string(traces: Collection[str], sep: str = ",", case_id_key: str = constants.CASE_CONCEPT_NAME, return_legacy_log_object: bool = constants.DEFAULT_READ_XES_LEGACY_OBJECT) -> Union[EventLog, pd.DataFrame]: """ - Parse a collection of traces expressed as strings - (e.g., ["A,B,C,D", "A,C,B,D", "A,D"]) - to a log object (Pandas dataframe) - - :param traces: Collection of traces expressed as strings - :param sep: Separator used to split the activities of a string trace - :param activity_key: The attribute that should be used as activity - :param timestamp_key: The attribute that should be used as timestamp - :param case_id_key: The attribute that should be used as case identifier - :param return_legacy_log_object: boolean value enabling returning a log object (default: False) - :rtype: ``pd.DataFrame`` + Parses a collection of traces expressed as strings (e.g., ["A,B,C,D", "A,C,B,D", "A,D"]) into a log object. + + :param traces: Collection of traces expressed as strings. + :param sep: Separator used to split the activities in a string trace. + :param activity_key: The attribute to be used as activity. + :param timestamp_key: The attribute to be used as timestamp. + :param case_id_key: The attribute to be used as case identifier. + :param return_legacy_log_object: If True, returns a legacy log object (EventLog). If False, returns a Pandas DataFrame. Default is False. + :return: A log object, either as a legacy EventLog or a Pandas DataFrame. + :rtype: Union[EventLog, pd.DataFrame] .. 
code-block:: python3 @@ -415,9 +475,13 @@ def parse_event_log_string(traces: Collection[str], sep: str = ",", cases.append(str(index)) activitiess.append(act) timestamps.append(strpfromiso.fix_naivety(datetime.datetime.fromtimestamp(this_timest))) - this_timest = this_timest + 1 + this_timest += 1 - dataframe = pandas_utils.instantiate_dataframe({case_id_key: cases, activity_key: activitiess, timestamp_key: timestamps}) + dataframe = pandas_utils.instantiate_dataframe({ + case_id_key: cases, + activity_key: activitiess, + timestamp_key: timestamps + }) if return_legacy_log_object: import pm4py @@ -427,27 +491,34 @@ def parse_event_log_string(traces: Collection[str], sep: str = ",", return dataframe -def project_on_event_attribute(log: Union[EventLog, pd.DataFrame], attribute_key=xes_constants.DEFAULT_NAME_KEY, case_id_key=None) -> \ -List[List[str]]: +def project_on_event_attribute(log: Union[EventLog, pd.DataFrame], attribute_key=xes_constants.DEFAULT_NAME_KEY, case_id_key=None) -> List[List[str]]: """ - Project the event log on a specified event attribute. The result is a list, containing a list for each case: - all the cases are transformed to list of values for the specified attribute. + Projects the event log onto a specified event attribute. The result is a list containing a list for each case, where each case is represented as a list of values for the specified attribute. - Example: + **Example**: + ```python pm4py.project_on_event_attribute(log, "concept:name") - - [['register request', 'examine casually', 'check ticket', 'decide', 'reinitiate request', 'examine thoroughly', 'check ticket', 'decide', 'pay compensation'], - ['register request', 'check ticket', 'examine casually', 'decide', 'pay compensation'], - ['register request', 'examine thoroughly', 'check ticket', 'decide', 'reject request'], - ['register request', 'examine casually', 'check ticket', 'decide', 'pay compensation'], - ['register request', 'examine casually', 'check ticket', 'decide', 'reinitiate request', 'check ticket', 'examine casually', 'decide', 'reinitiate request', 'examine casually', 'check ticket', 'decide', 'reject request'], - ['register request', 'check ticket', 'examine thoroughly', 'decide', 'reject request']] - - :param log: Event log / Pandas dataframe - :param attribute_key: The attribute to be used - :param case_id_key: The attribute to be used as case identifier - :rtype: ``List[List[str]]`` + ``` + + **Output**: + + ```python + [ + ['register request', 'examine casually', 'check ticket', 'decide', 'reinitiate request', 'examine thoroughly', 'check ticket', 'decide', 'pay compensation'], + ['register request', 'check ticket', 'examine casually', 'decide', 'pay compensation'], + ['register request', 'examine thoroughly', 'check ticket', 'decide', 'reject request'], + ['register request', 'examine casually', 'check ticket', 'decide', 'pay compensation'], + ['register request', 'examine casually', 'check ticket', 'decide', 'reinitiate request', 'check ticket', 'examine casually', 'decide', 'reinitiate request', 'examine casually', 'check ticket', 'decide', 'reject request'], + ['register request', 'check ticket', 'examine thoroughly', 'decide', 'reject request'] + ] + ``` + + :param log: Event log or Pandas DataFrame. + :param attribute_key: The attribute to be used for projection. + :param case_id_key: (Optional) The attribute to be used as case identifier. + :return: A list of lists containing the projected attribute values for each case. + :rtype: List[List[str]] .. 
code-block:: python3 @@ -466,21 +537,28 @@ def project_on_event_attribute(log: Union[EventLog, pd.DataFrame], attribute_key parameters[from_pandas.Parameters.CASE_ID_KEY] = case_id_key it = from_pandas.apply(log, parameters=parameters) for trace in it: - output.append([x[xes_constants.DEFAULT_NAME_KEY] if xes_constants.DEFAULT_NAME_KEY is not None else None for x in trace]) + output.append([ + x[xes_constants.DEFAULT_NAME_KEY] if xes_constants.DEFAULT_NAME_KEY is not None else None + for x in trace + ]) else: for trace in log: - output.append([x[attribute_key] if attribute_key is not None else None for x in trace]) + output.append([ + x[attribute_key] if attribute_key is not None else None + for x in trace + ]) return output def sample_cases(log: Union[EventLog, pd.DataFrame], num_cases: int, case_id_key: str = "case:concept:name") -> Union[EventLog, pd.DataFrame]: """ - (Random) Sample a given number of cases from the event log. + Randomly samples a given number of cases from the event log. - :param log: Event log / Pandas dataframe - :param num_cases: Number of cases to sample - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventLog, pd.DataFrame]`` + :param log: Event log or Pandas DataFrame. + :param num_cases: Number of cases to sample. + :param case_id_key: Attribute to be used as case identifier. + :return: A sampled log object, either as an EventLog or a Pandas DataFrame. + :rtype: Union[EventLog, pd.DataFrame] .. code-block:: python3 @@ -506,12 +584,12 @@ def sample_cases(log: Union[EventLog, pd.DataFrame], num_cases: int, case_id_key def sample_events(log: Union[EventStream, OCEL], num_events: int) -> Union[EventStream, OCEL, pd.DataFrame]: """ - (Random) Sample a given number of events from the event log. + Randomly samples a given number of events from the event log. - :param log: Event stream / OCEL / Pandas dataframes - :param num_events: Number of events to sample - :param case_id_key: attribute to be used as case identifier - :rtype: ``Union[EventStream, OCEL, pd.DataFrame]`` + :param log: Event stream, OCEL, or Pandas DataFrame. + :param num_events: Number of events to sample. + :return: A sampled log object, either as an EventStream, OCEL, or Pandas DataFrame. + :rtype: Union[EventStream, OCEL, pd.DataFrame] .. 
code-block:: python3 @@ -541,11 +619,11 @@ def __event_log_deprecation_warning(log): if constants.SHOW_EVENT_LOG_DEPRECATION and not hasattr(log, "deprecation_warning_shown"): if constants.SHOW_INTERNAL_WARNINGS: if isinstance(log, EventLog) or isinstance(log, Trace): - warnings.warn("the EventLog class has been deprecated and will be removed in a future release.") + warnings.warn("The EventLog class has been deprecated and will be removed in a future release.") log.deprecation_warning_shown = True elif isinstance(log, Trace): - warnings.warn("the Trace class has been deprecated and will be removed in a future release.") + warnings.warn("The Trace class has been deprecated and will be removed in a future release.") log.deprecation_warning_shown = True elif isinstance(log, EventStream): - warnings.warn("the EventStream class has been deprecated and will be removed in a future release.") + warnings.warn("The EventStream class has been deprecated and will be removed in a future release.") log.deprecation_warning_shown = True diff --git a/pm4py/write.py b/pm4py/write.py index 87715168b..aca223a4b 100644 --- a/pm4py/write.py +++ b/pm4py/write.py @@ -20,7 +20,7 @@ Contact: info@processintelligence.solutions ''' __doc__ = """ -The ``pm4py.write`` module contains all funcationality related to writing files/objects to disk. +The ``pm4py.write`` module contains all functionality related to writing files/objects to disk. """ from pm4py.objects.bpmn.obj import BPMN @@ -35,16 +35,16 @@ from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns -def write_xes(log: Union[EventLog, pd.DataFrame], file_path: str, case_id_key: str = "case:concept:name", extensions = None, encoding: str = constants.DEFAULT_ENCODING, **kwargs) -> None: +def write_xes(log: Union[EventLog, pd.DataFrame], file_path: str, case_id_key: str = "case:concept:name", extensions=None, encoding: str = constants.DEFAULT_ENCODING, **kwargs) -> None: """ - Writes an event log to disk in the XES format (see `xes-standard `_) - - :param log: log object (``pandas.DataFrame``) that needs to be written to disk - :param file_path: target file path of the event log (``.xes`` file) on disk - :param case_id_key: column key that identifies the case identifier - :param extensions: extensions defined for the event log - :param encoding: the encoding to be used (default: utf-8) - + Writes an event log to disk in the XES format (see `xes-standard `_). + + :param log: Log object (``EventLog`` or ``pandas.DataFrame``) that needs to be written to disk. + :param file_path: Target file path of the event log (``.xes`` file) on disk. + :param case_id_key: Column key that identifies the case identifier. + :param extensions: Extensions defined for the event log. + :param encoding: The encoding to be used (default: utf-8). + .. code-block:: python3 import pm4py @@ -73,19 +73,19 @@ def write_xes(log: Union[EventLog, pd.DataFrame], file_path: str, case_id_key: s def write_pnml(petri_net: PetriNet, initial_marking: Marking, final_marking: Marking, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ - Writes a Petri net object to disk in the ``.pnml`` format (see `pnml-standard `_) + Writes a Petri net object to disk in the ``.pnml`` format (see `pnml-standard `_). 
- :param petri_net: Petri net object that needs to be written to disk - :param initial_marking: initial marking of the Petri net - :param final_marking: final marking of the Petri net - :param file_path: target file path on disk of the ``.pnml`` file - :param encoding: the encoding to be used (default: utf-8) + :param petri_net: Petri net object that needs to be written to disk. + :param initial_marking: Initial marking of the Petri net. + :param final_marking: Final marking of the Petri net. + :param file_path: Target file path on disk of the ``.pnml`` file. + :param encoding: The encoding to be used (default: utf-8). .. code-block:: python3 import pm4py - log = pm4py.write_pnml(pn, im, fm, '') + pm4py.write_pnml(petri_net, initial_marking, final_marking, '') """ file_path = str(file_path) if not file_path.lower().endswith("pnml"): @@ -95,19 +95,20 @@ def write_pnml(petri_net: PetriNet, initial_marking: Marking, final_marking: Mar petri_exporter.apply(petri_net, initial_marking, file_path, final_marking=final_marking, parameters={"encoding": encoding}) -def write_ptml(tree: ProcessTree, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: +def write_ptml(tree: ProcessTree, file_path: str, auto_layout: bool = True, encoding: str = constants.DEFAULT_ENCODING) -> None: """ Writes a process tree object to disk in the ``.ptml`` format. - :param tree: ProcessTree object that needs to be written to disk - :param file_path: target file path on disk of the ``.ptml`` file - :param encoding: the encoding to be used (default: utf-8) + :param tree: ProcessTree object that needs to be written to disk. + :param file_path: Target file path on disk of the ``.ptml`` file. + :param auto_layout: Boolean indicating whether the model should get an auto layout (which is written to disk). + :param encoding: The encoding to be used (default: utf-8). .. code-block:: python3 import pm4py - log = pm4py.write_ptml(tree, '') + pm4py.write_ptml(tree, '', auto_layout=True) """ file_path = str(file_path) if not file_path.lower().endswith("ptml"): @@ -117,21 +118,21 @@ def write_ptml(tree: ProcessTree, file_path: str, encoding: str = constants.DEFA tree_exporter.apply(tree, file_path, parameters={"encoding": encoding}) -def write_dfg(dfg: Dict[Tuple[str,str],int], start_activities: Dict[str,int], end_activities: Dict[str,int], file_path: str, encoding: str = constants.DEFAULT_ENCODING): +def write_dfg(dfg: Dict[Tuple[str, str], int], start_activities: Dict[str, int], end_activities: Dict[str, int], file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ Writes a directly follows graph (DFG) object to disk in the ``.dfg`` format. - :param dfg: directly follows relation (multiset of activity-activity pairs) - :param start_activities: multiset tracking the number of occurrences of start activities - :param end_activities: mulltiset tracking the number of occurrences of end activities - :param file_path: target file path on disk to write the dfg object to - :param encoding: the encoding to be used (default: utf-8) + :param dfg: Directly follows relation (multiset of activity-activity pairs). + :param start_activities: Multiset tracking the number of occurrences of start activities. + :param end_activities: Multiset tracking the number of occurrences of end activities. + :param file_path: Target file path on disk to write the DFG object to. + :param encoding: The encoding to be used (default: utf-8). .. 
code-block:: python3 import pm4py - log = pm4py.write_dfg(dfg, sa, ea, '') + pm4py.write_dfg(dfg, start_activities, end_activities, '') """ file_path = str(file_path) if not file_path.lower().endswith("dfg"): @@ -144,20 +145,20 @@ def write_dfg(dfg: Dict[Tuple[str,str],int], start_activities: Dict[str,int], e "encoding": encoding}) -def write_bpmn(model: BPMN, file_path: str, auto_layout: bool = True, encoding: str = constants.DEFAULT_ENCODING): +def write_bpmn(model: BPMN, file_path: str, auto_layout: bool = True, encoding: str = constants.DEFAULT_ENCODING) -> None: """ Writes a BPMN model object to disk in the ``.bpmn`` format. - :param model: BPMN model to export - :param file_path: target file path on disk to write the BPMN object to - :param auto_layout: boolean indicating whether the model should get an auto layout (which is written to disk) - :param encoding: the encoding to be used (default: utf-8) + :param model: BPMN model to export. + :param file_path: Target file path on disk to write the BPMN object to. + :param auto_layout: Boolean indicating whether the model should get an auto layout (which is written to disk). + :param encoding: The encoding to be used (default: utf-8). .. code-block:: python3 import pm4py - log = pm4py.write_bpmn(model, '') + pm4py.write_bpmn(model, '', auto_layout=True) """ file_path = str(file_path) if not file_path.lower().endswith("bpmn"): @@ -170,74 +171,75 @@ def write_bpmn(model: BPMN, file_path: str, auto_layout: bool = True, encoding: exporter.apply(model, file_path, parameters={"encoding": encoding}) -def write_ocel(ocel: OCEL, file_path: str, objects_path: str = None, encoding: str = constants.DEFAULT_ENCODING): +def write_ocel(ocel: OCEL, file_path: str, objects_path: str = None, encoding: str = constants.DEFAULT_ENCODING) -> None: """ - Writes an OCEL object to disk in the ``.bpmn`` format. - Different formats are supported, including CSV (flat table), JSON-OCEL, XML-OCEL and SQLite - (described in the site http://www.ocel-standard.org/). + Writes an OCEL object to disk in various formats. + Supported formats include CSV (flat table), JSON-OCEL, XML-OCEL, and SQLite + (described on the site https://www.ocel-standard.org/). - :param ocel: OCEL object to write to disk - :param file_path: target file path on disk to write the OCEL object to - :param objects_path: location of the objects table (only applicable in case of .csv exporting) - :param encoding: the encoding to be used (default: utf-8) + :param ocel: OCEL object to write to disk. + :param file_path: Target file path on disk to write the OCEL object to. + :param objects_path: Location of the objects table (only applicable in case of .csv exporting). + :param encoding: The encoding to be used (default: utf-8). .. 
code-block:: python3 import pm4py - log = pm4py.write_ocel(ocel, '') + pm4py.write_ocel(ocel, '') """ file_path = str(file_path) if file_path.lower().endswith("csv"): - return write_ocel_csv(ocel, file_path, objects_path, encoding=encoding) + write_ocel_csv(ocel, file_path, objects_path, encoding=encoding) elif file_path.lower().endswith("jsonocel"): - return write_ocel_json(ocel, file_path, encoding=encoding) + write_ocel_json(ocel, file_path, encoding=encoding) elif file_path.lower().endswith("xmlocel"): - return write_ocel_xml(ocel, file_path, encoding=encoding) + write_ocel_xml(ocel, file_path, encoding=encoding) elif file_path.lower().endswith("sqlite"): - return write_ocel_sqlite(ocel, file_path, encoding=encoding) - raise Exception("unsupported file format") + write_ocel_sqlite(ocel, file_path, encoding=encoding) + else: + raise Exception("Unsupported file format.") -def write_ocel_csv(ocel: OCEL, file_path: str, objects_path: str, encoding: str = constants.DEFAULT_ENCODING): +def write_ocel_csv(ocel: OCEL, file_path: str, objects_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ Writes an OCEL object to disk in the ``.csv`` file format. The OCEL object is exported into two separate files, i.e., one event table and one objects table. - Both file paths should be specified + Both file paths should be specified. - :param ocel: OCEL object - :param file_path: target file path on disk to write the event table to - :param objects_path: target file path on disk to write the objects table to - :param encoding: the encoding to be used (default: utf-8) + :param ocel: OCEL object. + :param file_path: Target file path on disk to write the event table to. + :param objects_path: Target file path on disk to write the objects table to. + :param encoding: The encoding to be used (default: utf-8). .. code-block:: python3 import pm4py - log = pm4py.write_ocel_csv(ocel, '', '') + pm4py.write_ocel_csv(ocel, '', '') """ file_path = str(file_path) if not file_path.lower().endswith("csv"): file_path = file_path + ".csv" from pm4py.objects.ocel.exporter.csv import exporter as csv_exporter - return csv_exporter.apply(ocel, file_path, objects_path=objects_path, parameters={"encoding": encoding}) + csv_exporter.apply(ocel, file_path, objects_path=objects_path, parameters={"encoding": encoding}) -def write_ocel_json(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING): +def write_ocel_json(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ - Writes an OCEL object to disk in the ``.json`` file format (exported as ``.oceljson`` file). + Writes an OCEL object to disk in the ``.jsonocel`` file format. - :param ocel: OCEL object - :param file_path: target file path on disk to write the OCEL object to - :param encoding: the encoding to be used (default: utf-8) + :param ocel: OCEL object. + :param file_path: Target file path on disk to write the OCEL object to. + :param encoding: The encoding to be used (default: utf-8). .. 
code-block:: python3 import pm4py - log = pm4py.write_ocel_json(ocel, '') + pm4py.write_ocel_json(ocel, '') """ file_path = str(file_path) if not file_path.lower().endswith("jsonocel"): @@ -248,138 +250,141 @@ def write_ocel_json(ocel: OCEL, file_path: str, encoding: str = constants.DEFAUL is_ocel20 = ocel.is_ocel20() variant = jsonocel_exporter.Variants.OCEL20 if is_ocel20 else jsonocel_exporter.Variants.CLASSIC - return jsonocel_exporter.apply(ocel, file_path, variant=variant, parameters={"encoding": encoding}) + jsonocel_exporter.apply(ocel, file_path, variant=variant, parameters={"encoding": encoding}) -def write_ocel_xml(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING): +def write_ocel_xml(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ - Writes an OCEL object to disk in the ``.xml`` file format (exported as ``.ocelxml`` file). + Writes an OCEL object to disk in the ``.xmlocel`` file format. - :param ocel: OCEL object - :param file_path: target file path on disk to write the OCEL object to - :param encoding: the encoding to be used (default: utf-8) + :param ocel: OCEL object. + :param file_path: Target file path on disk to write the OCEL object to. + :param encoding: The encoding to be used (default: utf-8). .. code-block:: python3 import pm4py - log = pm4py.write_ocel_xml(ocel, '') + pm4py.write_ocel_xml(ocel, '') """ file_path = str(file_path) if not file_path.lower().endswith("xmlocel"): file_path = file_path + ".xmlocel" from pm4py.objects.ocel.exporter.xmlocel import exporter as xmlocel_exporter - return xmlocel_exporter.apply(ocel, file_path, variant=xmlocel_exporter.Variants.CLASSIC, parameters={"encoding": encoding}) + xmlocel_exporter.apply(ocel, file_path, variant=xmlocel_exporter.Variants.CLASSIC, parameters={"encoding": encoding}) -def write_ocel_sqlite(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING): +def write_ocel_sqlite(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ Writes an OCEL object to disk to a ``SQLite`` database (exported as ``.sqlite`` file). - :param ocel: OCEL object - :param file_path: target file path to the SQLite datbaase - :param encoding: the encoding to be used (default: utf-8) + :param ocel: OCEL object. + :param file_path: Target file path to the SQLite database. + :param encoding: The encoding to be used (default: utf-8). .. code-block:: python3 import pm4py - log = pm4py.write_ocel_sqlite(ocel, '') + pm4py.write_ocel_sqlite(ocel, '') """ file_path = str(file_path) if not file_path.lower().endswith("sqlite"): file_path = file_path + ".sqlite" from pm4py.objects.ocel.exporter.sqlite import exporter as sqlite_exporter - return sqlite_exporter.apply(ocel, file_path, variant=sqlite_exporter.Variants.PANDAS_EXPORTER, parameters={"encoding": encoding}) + sqlite_exporter.apply(ocel, file_path, variant=sqlite_exporter.Variants.PANDAS_EXPORTER, parameters={"encoding": encoding}) -def write_ocel2(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING): +def write_ocel2(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None: """ - Writes an OCEL2.0 object to disk + Writes an OCEL2.0 object to disk in various formats. + Supported formats include JSON-OCEL, XML-OCEL, and SQLite. - :param ocel: OCEL object - :param file_path: target file path to the SQLite datbaase - :param encoding: the encoding to be used (default: utf-8) + :param ocel: OCEL object. 
+    :param file_path: Target file path to write the OCEL2.0 object to.
+    :param encoding: The encoding to be used (default: utf-8).

     .. code-block:: python3

         import pm4py

-        log = pm4py.write_ocel2(ocel, '')
+        pm4py.write_ocel2(ocel, '')
     """
     file_path = str(file_path)

     if file_path.lower().endswith("sqlite"):
-        return write_ocel2_sqlite(ocel, file_path, encoding=encoding)
+        write_ocel2_sqlite(ocel, file_path, encoding=encoding)
     elif file_path.lower().endswith("xml") or file_path.lower().endswith("xmlocel"):
-        return write_ocel2_xml(ocel, file_path, encoding=encoding)
+        write_ocel2_xml(ocel, file_path, encoding=encoding)
     elif file_path.lower().endswith("jsonocel"):
-        return write_ocel2_json(ocel, file_path, encoding=encoding)
+        write_ocel2_json(ocel, file_path, encoding=encoding)
+    else:
+        raise Exception("Unsupported file format for OCEL2.0 export.")


-def write_ocel2_json(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING):
+def write_ocel2_json(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None:
     """
-    Writes an OCEL2.0 object to disk to an ``JSON`` file (exported as ``.jsonocel`` file).
+    Writes an OCEL2.0 object to disk in the ``.jsonocel`` file format.

-    :param ocel: OCEL object
-    :param file_path: target file path to the JSON file
-    :param encoding: the encoding to be used (default: utf-8)
+    :param ocel: OCEL object.
+    :param file_path: Target file path to the JSON-OCEL file.
+    :param encoding: The encoding to be used (default: utf-8).

     .. code-block:: python3

         import pm4py

-        log = pm4py.write_ocel2_json(ocel, '')
+        pm4py.write_ocel2_json(ocel, '')
     """
     file_path = str(file_path)
-    if "json" not in file_path:
-        file_path = file_path + ".json"
+    if not file_path.lower().endswith("jsonocel"):
+        file_path = file_path + ".jsonocel"

     from pm4py.objects.ocel.exporter.jsonocel import exporter as jsonocel_exporter
-    return jsonocel_exporter.apply(ocel, file_path, variant=jsonocel_exporter.Variants.OCEL20_STANDARD, parameters={"encoding": encoding})
+    jsonocel_exporter.apply(ocel, file_path, variant=jsonocel_exporter.Variants.OCEL20_STANDARD, parameters={"encoding": encoding})


-def write_ocel2_sqlite(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING):
+def write_ocel2_sqlite(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None:
     """
     Writes an OCEL2.0 object to disk to a ``SQLite`` database (exported as ``.sqlite`` file).

-    :param ocel: OCEL object
-    :param file_path: target file path to the SQLite datbaase
-    :param encoding: the encoding to be used (default: utf-8)
+    :param ocel: OCEL object.
+    :param file_path: Target file path to the SQLite database.
+    :param encoding: The encoding to be used (default: utf-8).

     .. code-block:: python3

         import pm4py

-        log = pm4py.write_ocel2_sqlite(ocel, '')
+        pm4py.write_ocel2_sqlite(ocel, '')
     """
     file_path = str(file_path)
     if not file_path.lower().endswith("sqlite"):
         file_path = file_path + ".sqlite"

     from pm4py.objects.ocel.exporter.sqlite import exporter as sqlite_exporter
-    return sqlite_exporter.apply(ocel, file_path, variant=sqlite_exporter.Variants.OCEL20, parameters={"encoding": encoding})
+    sqlite_exporter.apply(ocel, file_path, variant=sqlite_exporter.Variants.OCEL20, parameters={"encoding": encoding})


-def write_ocel2_xml(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING):
+def write_ocel2_xml(ocel: OCEL, file_path: str, encoding: str = constants.DEFAULT_ENCODING) -> None:
     """
-    Writes an OCEL2.0 object to disk to an ``XML`` file (exported as ``.xmlocel`` file).
+    Writes an OCEL2.0 object to disk in the ``.xmlocel`` file format.

-    :param ocel: OCEL object
-    :param file_path: target file path to the XML file
-    :param encoding: the encoding to be used (default: utf-8)
+    :param ocel: OCEL object.
+    :param file_path: Target file path to the XML-OCEL file.
+    :param encoding: The encoding to be used (default: utf-8).

     .. code-block:: python3

         import pm4py

-        log = pm4py.write_ocel2_xml(ocel, '')
+        pm4py.write_ocel2_xml(ocel, '')
     """
     file_path = str(file_path)
-    if not file_path.lower().endswith("xml") and not file_path.lower().endswith("xmlocel"):
+    if not file_path.lower().endswith("xmlocel"):
         file_path = file_path + ".xmlocel"

     from pm4py.objects.ocel.exporter.xmlocel import exporter as xml_exporter
-    return xml_exporter.apply(ocel, file_path, variant=xml_exporter.Variants.OCEL20, parameters={"encoding": encoding})
+    xml_exporter.apply(ocel, file_path, variant=xml_exporter.Variants.OCEL20, parameters={"encoding": encoding})
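Similarly, a minimal sketch of the OCEL 2.0 writers (outside the diff, assuming ``pm4py.read_ocel2`` is available in the installed pm4py version, and using hypothetical file names): since the exporters now return ``None``, callers should reload the exported file explicitly rather than relying on a return value.

.. code-block:: python3

    import pm4py

    # hypothetical input file; any OCEL 2.0 object works here
    ocel = pm4py.read_ocel2("example.xmlocel")

    # extension-based dispatch inside write_ocel2
    pm4py.write_ocel2(ocel, "export.jsonocel")  # JSON-OCEL 2.0
    pm4py.write_ocel2(ocel, "export.sqlite")    # SQLite (OCEL 2.0 schema)

    # the writers return None, so reload the file explicitly if needed
    ocel_reloaded = pm4py.read_ocel2("export.sqlite")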