1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python3
"""
Cache Effectiveness Analysis Script
This script analyzes task statistics to identify which tasks are not getting
significant benefit from caching and would be candidates for removing the
caching layer.
To use this script, first run a build with `NEXT_TURBOPACK_TASK_STATISTICS=path/to/stats.json` set.
Then run this script with the path to the stats.json file to get a report on optimization opportunities.
Based on benchmarking data from the `turbopack/crates/turbo-tasks-backend/benches/overhead.rs` benchmark we have the following estimates:
- Cache hit cost: 200-500ns
- Execution overhead: 4-6us
- Measurement overhead: 260ns-750ns
This script uses the upper end of each range (the best case for removing the cache) and reports on the potential time savings from removing the caching layer.
"""
import json
import sys
from typing import Dict, List, Tuple
from dataclasses import dataclass
@dataclass
class TaskStats:
    """Counters recorded for a single task, plus metrics derived from them."""

    # Task identifier (the key under which the task appears in the stats file).
    name: str
    # Number of lookups answered from the cache.
    cache_hit: int
    # Number of lookups that were not answered from the cache.
    cache_miss: int
    # Number of times the task body was executed.
    executions: int
    # Total reported duration, in nanoseconds, summed over all executions.
    duration_ns: int

    @property
    def total_operations(self) -> int:
        """Return the number of cache lookups (hits plus misses)."""
        return self.cache_hit + self.cache_miss

    @property
    def cache_hit_rate(self) -> float:
        """Return the fraction of lookups that hit the cache (0.0 if none)."""
        lookups = self.total_operations
        return self.cache_hit / lookups if lookups != 0 else 0.0

    @property
    def avg_execution_time_ns(self) -> int:
        """Return the mean per-execution time in ns, net of measurement cost.

        The fixed per-execution measurement overhead is subtracted from the
        reported duration before averaging; the result is clamped at zero and
        is 0 when the task never executed.
        """
        per_run_measurement_ns = 750  # overhead implicit in the reported duration
        runs = self.executions
        if runs == 0:
            return 0
        net_duration_ns = self.duration_ns - per_run_measurement_ns * runs
        return max(0, net_duration_ns // runs)
def parse_duration(duration_dict: Dict) -> int:
    """Return the duration described by ``secs``/``nanos`` keys, in nanoseconds.

    Either key may be absent, in which case it contributes zero.
    """
    whole_seconds = duration_dict.get("secs", 0)
    leftover_nanos = duration_dict.get("nanos", 0)
    return whole_seconds * 1_000_000_000 + leftover_nanos
def load_task_stats(file_path: str) -> List[TaskStats]:
    """Load and parse task statistics from a JSON file.

    The file is expected to hold a JSON object mapping each task name to an
    object with ``cache_hit``, ``cache_miss``, ``executions`` and ``duration``
    entries, where ``duration`` is a dict understood by ``parse_duration``.

    Args:
        file_path: Path of the statistics JSON file.

    Returns:
        One ``TaskStats`` per entry, in the order the entries appear in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a task entry lacks one of the required fields.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # encoding, which would garble non-ASCII task names on some systems.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return [
        TaskStats(
            name=task_name,
            cache_hit=stats["cache_hit"],
            cache_miss=stats["cache_miss"],
            executions=stats["executions"],
            duration_ns=parse_duration(stats["duration"]),
        )
        for task_name, stats in data.items()
    ]
def calculate_cache_effectiveness(task: TaskStats) -> float:
    """
    Estimate the time saved by removing the caching layer from a task.

    The estimate compares the modelled cost of the task with caching (cheap
    hits, misses that pay caching overhead plus the real execution) against
    the cost of executing the task directly on every operation.

    Args:
        task: Statistics for the task being evaluated.

    Returns:
        Estimated savings in nanoseconds from removing caching. A positive
        value means removing the cache would save time; a negative value
        means caching is beneficial. Returns 0.0 for a task with no
        recorded operations.
    """
    # Constants based on benchmarking. Both use the upper end of the measured
    # range, which is the most favourable assumption for removing the cache.
    CACHE_HIT_COST_NS = 500  # upper end of the 200-500ns range
    EXECUTION_OVERHEAD_NS = 6000  # upper end of the 4-6us range (caching layer overhead)

    if task.total_operations == 0:
        return 0.0

    # Measurement overhead is already removed by
    # TaskStats.avg_execution_time_ns, so it is not applied again here.
    avg_execution_ns = task.avg_execution_time_ns

    # Current cost with caching:
    #   cache hits   -> just the cache lookup cost
    #   cache misses -> caching overhead + actual execution time
    cache_hit_cost = task.cache_hit * CACHE_HIT_COST_NS
    cache_miss_cost = task.cache_miss * (EXECUTION_OVERHEAD_NS + avg_execution_ns)
    current_total_cost = cache_hit_cost + cache_miss_cost

    # Cost without caching: every operation is a direct execution, no overhead.
    no_cache_cost = task.total_operations * avg_execution_ns

    # Positive means we save time by removing the cache.
    return current_total_cost - no_cache_cost
def analyze_tasks(tasks: List[TaskStats]) -> List[Tuple[TaskStats, float]]:
    """Pair every task with its estimated savings, largest savings first."""
    scored = [(entry, calculate_cache_effectiveness(entry)) for entry in tasks]
    # Highest estimated savings first; ties keep their input order.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
def format_time(nanoseconds: float) -> str:
    """Render a nanosecond quantity using the largest fitting unit (ns, μs, ms, s).

    Negative inputs are formatted by magnitude and prefixed with ``-``.
    """
    prefix = "-" if nanoseconds < 0 else ""
    magnitude = abs(nanoseconds)

    # (nanoseconds per unit, decimal places, suffix), largest unit first.
    scales = (
        (1_000_000_000, 2, "s"),
        (1_000_000, 2, "ms"),
        (1_000, 1, "μs"),
    )
    for per_unit, decimals, suffix in scales:
        if magnitude >= per_unit:
            return f"{prefix}{magnitude / per_unit:.{decimals}f}{suffix}"

    # Below one microsecond: plain nanoseconds, no decimals.
    return f"{prefix}{magnitude:.0f}ns"
def print_analysis(results: List[Tuple[TaskStats, float]]):
    """Write the savings report for ``results`` to standard output.

    Args:
        results: ``(task, estimated savings in ns)`` pairs; rows are printed
            in the order given.
    """

    def render_row(savings, hit_rate, exec_time, operations, task_name):
        # Fixed-width, left-aligned columns shared by the header and data rows.
        return (f"{savings:<10} {hit_rate:<8} {exec_time:<10} "
                f"{operations:<10} {task_name}")

    print("Tasks ranked by estimated time savings from removing caching layer")
    print()

    if not results:
        print("No tasks would benefit from removing caching.")
        return

    # Column titles followed by a rule of matching width.
    title_row = render_row("Savings", "Hit Rate", "Exec Time", "Operations", "Task Name")
    print(title_row)
    print("-" * len(title_row))

    # One row per task.
    for entry, saved_ns in results:
        print(render_row(
            format_time(saved_ns),
            f"{entry.cache_hit_rate:.1%}",
            format_time(entry.avg_execution_time_ns),
            f"{entry.total_operations:,}",
            entry.name,
        ))

    # Only tasks with strictly positive savings count towards the summary.
    positive_savings = [saved_ns for _, saved_ns in results if saved_ns > 0]
    print()
    print(f"Summary: {len(positive_savings)} tasks would benefit from removing caching")
    print(f"Total potential savings: {format_time(sum(positive_savings))}")
    print()

    for legend_line in (
        "Legend:",
        "- Savings: Time saved by removing caching layer",
        "- Hit Rate: Percentage of operations that were cache hits",
        "- Exec Time: Average execution time per operation",
        "- Operations: Total number of cache hits + misses",
    ):
        print(legend_line)
def main():
    """Command-line entry point: load a stats file and print the report.

    Expects exactly one command-line argument, the path to the statistics
    JSON file. The report is written to standard output; usage and error
    messages are written to standard error so they do not mix with the
    report. Exits with status 1 on bad usage or any failure.
    """
    if len(sys.argv) != 2:
        print("Usage: python analyze_cache_effectiveness.py <stats-durations.json>",
              file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]

    try:
        tasks = load_task_stats(file_path)
        results = analyze_tasks(tasks)
        print_analysis(results)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected and exit non-zero
        # instead of dumping a traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()