Fix time units (#51)
Summary:
## What does this PR do?
Fixes #50

## Before submitting

- [x] Was this discussed/approved via a GitHub issue? (not needed for typos or doc improvements)
  - [ ] N/A
- [ ] Did you write any new necessary tests?
  - [x] N/A
- [x] Did you make sure to update the docs?
  - [ ] N/A
- [ ] Did you update the [changelog](https://github.com/facebookresearch/HolisticTraceAnalysis/blob/main/CHANGELOG.md)?
  - [x] N/A

Pull Request resolved: #51

Reviewed By: sunghlin

Differential Revision: D45823760

Pulled By: anupambhatnagar

fbshipit-source-id: 5604bec975ec06cf44022bd6f2ddc5ab4eaf42bc
anupambhatnagar authored and facebook-github-bot committed May 15, 2023
1 parent 86f0cdc commit c3c2d6b
Showing 5 changed files with 34 additions and 31 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -31,6 +31,9 @@ HTA provides the following features:
insights into memory bandwidth utilized and number of outstanding operations on each CUDA stream.
1. __Trace Comparison__ - A trace comparison tool to identify and visualize the differences between
traces.
1. __CUPTI Counter Analysis__ - An experimental API to get GPU performance counters. By attributing
performance measurements from kernels to PyTorch operators, roofline analysis can be performed and
kernels can be optimized.

## Installation

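As a point of reference for the features listed above, here is a minimal sketch of loading traces with HTA. The trace directory is a placeholder, and the snippet assumes the documented `TraceAnalysis` entry point.

```python
# Minimal sketch (assumed entry point, placeholder path).
# Install with: pip install HolisticTraceAnalysis
from hta.trace_analysis import TraceAnalysis

# trace_dir should contain one PyTorch profiler trace file per rank.
analyzer = TraceAnalysis(trace_dir="~/traces/my_training_job")
```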
Binary file modified docs/source/_static/kernel_metrics_df.png
50 changes: 25 additions & 25 deletions hta/analyzers/breakdown_analysis.py
@@ -125,10 +125,10 @@ def get_gpu_kernel_breakdown(
)
all_kernel_df.rename(
columns={
"sum": "sum (ns)",
"mean": "mean (ns)",
"max": "max (ns)",
"min": "min (ns)",
"sum": "sum (us)",
"mean": "mean (us)",
"max": "max (us)",
"min": "min (us)",
"std": "stddev",
},
inplace=True,
@@ -191,16 +191,16 @@ def get_gpu_kernel_breakdown(
fig = px.bar(
kernel_name_df,
x="rank",
y="mean (ns)",
y="mean (us)",
title=name,
labels={
"rank": "Rank",
"mean (ns)": "Mean Duration (ns)",
"mean (us)": "Mean Duration (us)",
},
error_y=kernel_name_df["max (ns)"]
- kernel_name_df["mean (ns)"],
error_y_minus=kernel_name_df["mean (ns)"]
- kernel_name_df["min (ns)"],
error_y=kernel_name_df["max (us)"]
- kernel_name_df["mean (us)"],
error_y_minus=kernel_name_df["mean (us)"]
- kernel_name_df["min (us)"],
)
fig.update_layout(
title_text=f'Kernel type "{kernel}" - {name}',
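Usage note: a minimal sketch of how the relabeled kernel-breakdown columns surface to callers. The keyword argument and the two-DataFrame return shape are assumptions inferred from the tests in this diff, not a definitive API reference.

```python
from hta.trace_analysis import TraceAnalysis

analyzer = TraceAnalysis(trace_dir="~/traces/my_training_job")  # placeholder path

# Assumed: returns one DataFrame aggregated by kernel type and one per-kernel breakdown;
# visualize=False skips the plotly charts.
kernel_type_df, kernel_df = analyzer.get_gpu_kernel_breakdown(visualize=False)

# Duration columns are now labeled in microseconds.
print(kernel_df[["kernel_type", "sum (us)", "mean (us)", "max (us)", "min (us)"]].head())
```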
@@ -313,8 +313,8 @@ def _get_idle_time_for_kernels(cls, kernels_df: pd.DataFrame) -> Tuple[int, int]
"""
Compute idle time for given set of GPU kernels :
returns :
idle time (ns) = kernel time - merged execution time of all kernels
kernel time (ns) = defined as the time difference between end of the
idle time (us) = kernel time - merged execution time of all kernels
kernel time (us) = defined as the time difference between end of the
last kernel and start of the first kernel.
PS: we exclude the last profiler iteration while reading trace
so total time is exclusive of that.
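The docstring above defines idle time via the merged execution time of all kernels. Below is a minimal sketch of that interval-merging computation, assuming a trace DataFrame with `ts` (start) and `dur` (duration) columns in microseconds; this is illustrative, not the library's implementation.

```python
from typing import Tuple

import pandas as pd


def idle_and_kernel_time_us(kernels: pd.DataFrame) -> Tuple[int, int]:
    """Illustrative version of the formula above: idle = kernel time - merged busy time."""
    iv = kernels[["ts", "dur"]].copy()
    iv["end"] = iv["ts"] + iv["dur"]
    iv = iv.sort_values("ts")

    # Merge overlapping kernel intervals and accumulate the total covered (busy) time.
    busy, cur_start, cur_end = 0, None, None
    for start, end in zip(iv["ts"], iv["end"]):
        if cur_end is None or start > cur_end:  # gap: close out the previous interval
            if cur_end is not None:
                busy += cur_end - cur_start
            cur_start, cur_end = start, end
        else:  # overlap: extend the current interval
            cur_end = max(cur_end, end)
    if cur_end is not None:
        busy += cur_end - cur_start

    # Kernel time = end of the last kernel minus start of the first kernel.
    kernel_time = iv["end"].max() - iv["ts"].min()
    idle_time = kernel_time - busy
    return idle_time, kernel_time
```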
@@ -333,7 +333,7 @@ def get_temporal_breakdown(cls, t: "Trace", visualize: bool = True) -> pd.DataFr
sym_table = t.symbol_table.get_sym_table()

def idle_time_per_rank(trace_df: pd.DataFrame) -> Tuple[int, int, int, int]:
"""returns idle_time (ns) , compute_time (ns), non_compute_time (ns), total_time (ns)"""
"""returns idle_time (us) , compute_time (us), non_compute_time (us), total_time (us)"""
gpu_kernels = trace_df[trace_df["stream"].ne(-1)].copy()
idle_time, kernel_time = cls._get_idle_time_for_kernels(gpu_kernels)

@@ -362,22 +362,22 @@ def idle_time_per_rank(trace_df: pd.DataFrame) -> Tuple[int, int, int, int]:
idle_time, compute_time, non_compute_time, kernel_time = idle_time_per_rank(
trace_df
)
result["idle_time(ns)"].append(idle_time)
result["compute_time(ns)"].append(compute_time)
result["non_compute_time(ns)"].append(non_compute_time)
result["kernel_time(ns)"].append(kernel_time)
result["idle_time(us)"].append(idle_time)
result["compute_time(us)"].append(compute_time)
result["non_compute_time(us)"].append(non_compute_time)
result["kernel_time(us)"].append(kernel_time)

result_df = pd.DataFrame(result)
result_df["idle_time"] = (
result_df["idle_time(ns)"] / result_df["kernel_time(ns)"]
result_df["idle_time(us)"] / result_df["kernel_time(us)"]
)
result_df["idle_time_pctg"] = round(100 * result_df["idle_time"], 2)
result_df["compute_time"] = (
result_df["compute_time(ns)"] / result_df["kernel_time(ns)"]
result_df["compute_time(us)"] / result_df["kernel_time(us)"]
)
result_df["compute_time_pctg"] = round(100 * result_df["compute_time"], 2)
result_df["non_compute_time"] = (
result_df["non_compute_time(ns)"] / result_df["kernel_time(ns)"]
result_df["non_compute_time(us)"] / result_df["kernel_time(us)"]
)
result_df["non_compute_time_pctg"] = round(
100 * result_df["non_compute_time"], 2
@@ -403,10 +403,10 @@ def idle_time_per_rank(trace_df: pd.DataFrame) -> Tuple[int, int, int, int]:
return result_df[
[
"rank",
"idle_time(ns)",
"compute_time(ns)",
"non_compute_time(ns)",
"kernel_time(ns)",
"idle_time(us)",
"compute_time(us)",
"non_compute_time(us)",
"kernel_time(us)",
"idle_time_pctg",
"compute_time_pctg",
"non_compute_time_pctg",
@@ -563,7 +563,7 @@ def get_idle_time_breakdown(
)
else:
fig.update_layout(
yaxis_title="Idle time (ns)", legend_title="Idle Time Breakdown"
yaxis_title="Idle time (us)", legend_title="Idle Time Breakdown"
)
fig.show()

4 changes: 2 additions & 2 deletions hta/analyzers/cuda_kernel_analysis.py
@@ -172,10 +172,10 @@ def _generate_frequent_pattern_results(
for pattern, count in pattern_counts.items():
patterns_result["pattern"].append("|".join(sym_table[x] for x in pattern))
patterns_result["count"].append(count)
patterns_result["GPU kernel duration (ns)"].append(
patterns_result["GPU kernel duration (us)"].append(
pattern_durations[pattern][0]
)
patterns_result["CPU op duration (ns)"].append(
patterns_result["CPU op duration (us)"].append(
pattern_durations[pattern][1]
)
patterns_result["pattern_indices"].append(pattern_occurrences[pattern])
8 changes: 4 additions & 4 deletions tests/test_trace_analysis.py
@@ -62,9 +62,9 @@ def test_frequent_cuda_kernel_sequences(self, mock_write_trace):
)
self.assertEqual(frequent_patterns_dfs.iloc[2]["count"], 48)
self.assertEqual(
frequent_patterns_dfs.iloc[2]["GPU kernel duration (ns)"], 11300
frequent_patterns_dfs.iloc[2]["GPU kernel duration (us)"], 11300
)
self.assertEqual(frequent_patterns_dfs.iloc[2]["CPU op duration (ns)"], 9652)
self.assertEqual(frequent_patterns_dfs.iloc[2]["CPU op duration (us)"], 9652)
mock_write_trace.assert_called_once()
trace_output_filename, _ = mock_write_trace.call_args.args
self.assertEqual(trace_output_filename, self.overlaid_trace_file)
@@ -188,9 +188,9 @@ def test_get_gpu_kernel_breakdown(self):
self.assertEqual(kernel_type_breakdown.iloc[0]["kernel_type"], "COMMUNICATION")
self.assertEqual(kernel_type_breakdown.iloc[0]["sum"], 8040285)
self.assertEqual(kernel_breakdown.iloc[0]["kernel_type"], "COMMUNICATION")
self.assertEqual(kernel_breakdown.iloc[0]["sum (ns)"], 627683)
self.assertEqual(kernel_breakdown.iloc[0]["sum (us)"], 627683)
self.assertEqual(kernel_breakdown.iloc[151]["kernel_type"], "MEMORY")
self.assertEqual(kernel_breakdown.iloc[151]["sum (ns)"], 1064)
self.assertEqual(kernel_breakdown.iloc[151]["sum (us)"], 1064)

def test_get_queue_length_summary(self):
qd_summary = self.vision_transformer_t.get_queue_length_summary(ranks=[0])