first commit
evaluate/count_pass.py (new file, 45 lines)
@@ -0,0 +1,45 @@
import json
import pandas as pd
from collections import defaultdict

# Load the JSON file
file_path = "solutions.json"  # Adjust this path based on your local directory
with open(file_path, "r") as f:
    data = json.load(f)

# Initialize a dictionary to store the structured results
structured_results = defaultdict(lambda: defaultdict(lambda: {"total": 0, "pass": 0, "syntax_error": 0, "functional_error": 0}))

# Process the data to count various results per LLM and type
for llm, categories in data.items():
    for category, modules in categories.items():
        for module in modules:
            for solution in module.get("solutions", []):
                structured_results[category][llm]["total"] += 1

                pass_info = solution.get("pass", "")
                if pass_info == "true":
                    structured_results[category][llm]["pass"] += 1
                elif "Detected error while running simulation" in pass_info:
                    structured_results[category][llm]["syntax_error"] += 1

            # Functional error count: everything that neither passed nor failed simulation
            structured_results[category][llm]["functional_error"] = (
                structured_results[category][llm]["total"]
                - structured_results[category][llm]["syntax_error"]
                - structured_results[category][llm]["pass"]
            )

# Create a DataFrame from the structured results (cell format: pass | functional_error | syntax_error)
df_restructured = pd.DataFrame.from_dict(
    {category: {llm: f"{counts['pass']} | {counts['functional_error']} | {counts['syntax_error']}" for llm, counts in llms.items()}
     for category, llms in structured_results.items()},
    orient="index"
)

# Save to a CSV file
csv_output_path = "solution_pass_analysis.csv"  # Adjust the path as needed
df_restructured.to_csv(csv_output_path)

print(f"CSV file saved at: {csv_output_path}")
# print(df_restructured)
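Note: all three evaluate scripts read the same solutions.json, whose schema is not part of this commit. The snippet below is a minimal, hypothetical example reconstructed from how the scripts index the data; the key names ("pass", "resource usage", "optimized", "LUT", "module", "solutions") come from the code above, while the model name, category, and values are placeholders.

import json

# Hypothetical minimal solutions.json (schema inferred, values made up):
# LLM name -> category -> list of modules, each with a "solutions" list.
example = {
    "gpt-4": {
        "Combinational Logic": [
            {
                "module": "mux4to1",
                "solutions": [
                    {"pass": "true",
                     "resource usage": {"optimized": {"LUT": 1}}},
                    {"pass": "Detected error while running simulation (...)"},
                ],
            }
        ]
    }
}

with open("solutions.json", "w") as f:
    json.dump(example, f, indent=2)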
evaluate/count_resource.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import json
import pandas as pd
from collections import defaultdict

# Load the JSON file
file_path = "solutions.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Initialize a dictionary to store the minimal LUT usage for each module and LLM
lut_results = defaultdict(lambda: defaultdict(lambda: float("inf")))

# Process the data to extract the minimum LUT usage per module per LLM
for llm, categories in data.items():
    for category, modules in categories.items():
        for module_data in modules:
            module_name = module_data["module"].replace("_", " ")  # Replace underscores with spaces
            for solution in module_data.get("solutions", []):
                if "resource usage" in solution and "optimized" in solution["resource usage"]:
                    lut_count = solution["resource usage"]["optimized"].get("LUT", float("inf"))
                    # Store the minimum LUT usage
                    lut_results[module_name][llm] = min(lut_results[module_name][llm], lut_count)

# Convert the dictionary into a DataFrame (rows: modules, columns: LLMs)
df_lut = pd.DataFrame.from_dict(lut_results, orient="index")

# Save to a CSV file
csv_output_path = "solution_resource_analysis.csv"
df_lut.to_csv(csv_output_path)

# Print the CSV file path
print(f"CSV file saved at: {csv_output_path}")
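The inf values visible in solution_resource_analysis.csv below come straight from this sentinel: a module/LLM pair with no solution carrying optimized resource figures keeps float("inf"), which pandas serializes as the literal string inf. A minimal illustration:

import pandas as pd

# float("inf") is left untouched when no LUT count was ever recorded,
# and to_csv() writes it out as "inf".
df = pd.DataFrame.from_dict({"feistel cipher": {"gpt-4": float("inf")}}, orient="index")
print(df.to_csv())  # -> ",gpt-4\nfeistel cipher,inf"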
evaluate/plot_pass.py (new file, 136 lines)
@@ -0,0 +1,136 @@
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# --- Utility Functions ---

def compute_module_pass(solution_list, k):
    """
    Check the first k solutions for a module.
    Return 1 if at least one of them has a "pass" value (after stripping and
    lowercasing) equal to "true", otherwise return 0.
    """
    for sol in solution_list[:k]:
        if sol.get("pass", "").strip().lower() == "true":
            return 1
    return 0

def compute_pass_at_k_for_modules(modules, k):
    """
    Given a list of modules (each module is expected to have a "solutions" list),
    compute the fraction of modules that pass@k.
    """
    total = len(modules)
    if total == 0:
        return 0
    passed = sum(compute_module_pass(mod["solutions"], k) for mod in modules)
    return passed / total

def compute_overall_pass_at_k(llm_data, ks):
    """
    Given one LLM's data (a dict mapping category names to lists of modules),
    compute the overall pass@k (over all modules in all categories).
    Returns a dictionary mapping each k to the pass@k value.
    """
    all_modules = []
    for cat, modules in llm_data.items():
        all_modules.extend(modules)
    overall = {}
    for k in ks:
        overall[k] = compute_pass_at_k_for_modules(all_modules, k)
    return overall

def compute_category_pass_at_k(llm_data, ks):
    """
    For each category (type) in one LLM, compute pass@k.
    Returns a dictionary mapping category names to a dictionary of k -> pass@k.
    """
    cat_results = {}
    for cat, modules in llm_data.items():
        k_dict = {}
        for k in ks:
            k_dict[k] = compute_pass_at_k_for_modules(modules, k)
        cat_results[cat] = k_dict
    return cat_results

# --- Main processing and plotting ---

# Choose the k values you want to evaluate pass@k for:
ks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

# Load the JSON file.
input_json_file = "solutions.json"  # adjust filename if necessary
with open(input_json_file, "r") as f:
    data = json.load(f)

# We'll store our computed pass@k results per LLM in a dictionary.
llm_results = {}
for llm, llm_data in data.items():
    overall = compute_overall_pass_at_k(llm_data, ks)
    categories = compute_category_pass_at_k(llm_data, ks)
    llm_results[llm] = {
        "overall": overall,
        "categories": categories
    }

# --- Plot Overall Pass@k for each LLM ---
plt.figure(figsize=(10, 6))
for llm, res in llm_results.items():
    plt.plot(ks, [res["overall"][k] for k in ks], marker='o', label=llm)

# plt.xticks(ks)  # Ensure all values from 1 to 15 are shown
# plt.xlabel("k", fontsize=14)
# plt.ylabel("Overall Pass@k", fontsize=14)
# plt.title("Overall Pass@k across k for each LLM", fontsize=16)  # Larger title
# plt.legend(loc="upper left", bbox_to_anchor=(1, 1))  # Legend outside the plot
# plt.grid(True)
# plt.tight_layout()
# plt.savefig("./figures/overall_pass_at_k.png")
# plt.show()


# --- Plot Per-Category Pass@k for all LLMs, one heatmap per k ---
for k in ks:
    # Prepare data for the heatmap: the union of all categories across LLMs,
    # each mapped to its per-LLM pass@k value for this k.
    category_pass_k = {}
    for llm, res in llm_results.items():
        for cat, kdict in res["categories"].items():
            if cat not in category_pass_k:
                category_pass_k[cat] = {}
            category_pass_k[cat][llm] = kdict[k]

    # Convert to DataFrame (rows: categories, columns: LLMs)
    df_heatmap = pd.DataFrame.from_dict(category_pass_k).T

    # Plot heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(df_heatmap, annot=True, cmap="Blues", linewidths=0.5, fmt=".2f")

    plt.title(f"Pass@{k} Heatmap for Each LLM Across Categories", fontsize=16, fontweight="bold")
    plt.xlabel("LLM", fontsize=14, fontweight="bold")
    plt.ylabel("Category", fontsize=14, fontweight="bold")

    plt.xticks(rotation=45, ha="right", fontsize=12)
    plt.yticks(fontsize=12)

    plt.tight_layout()
    heatmap_path = f"./figures/per_category_pass_k{k}_heatmap.png"
    plt.savefig(heatmap_path)
    plt.close()  # release the figure before the next iteration

# --- (Optional) Print the computed results ---
print("Overall Pass@k per LLM:")
for llm, res in llm_results.items():
    print(f"{llm}: {res['overall']}")

print("\nPer-Category Pass@k per LLM:")
for llm, res in llm_results.items():
    print(f"{llm}:")
    for cat, kdict in res["categories"].items():
        print(f"  {cat}: {kdict}")
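A quick sanity check of the pass@k helpers above, assuming they are imported or pasted into the same session; the toy modules and "pass" strings below are illustrative, not taken from solutions.json:

# Two toy modules with three candidate solutions each.
modules = [
    {"solutions": [{"pass": "false"}, {"pass": " True "}, {"pass": "true"}]},
    {"solutions": [{"pass": "Detected error while running simulation"},
                   {"pass": "false"},
                   {"pass": "false"}]},
]

# Module 1 first passes at k=2 (" True ".strip().lower() == "true");
# module 2 never passes.
print(compute_pass_at_k_for_modules(modules, 1))  # 0.0
print(compute_pass_at_k_for_modules(modules, 2))  # 0.5
print(compute_pass_at_k_for_modules(modules, 3))  # 0.5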
evaluate/solution_pass_analysis.csv (new file, 13 lines; each cell is pass | functional_error | syntax_error, as written by count_pass.py)
@@ -0,0 +1,13 @@
,gpt-3.5-turbo,gpt-4,gpt-4o,gpt-o1-mini,llama3.1-405B,qwen-max,qwen-plus,qwen2.5-coder-32B-instruct,codestral
Combinational Logic,112 | 5 | 3,117 | 3 | 0,120 | 0 | 0,118 | 1 | 1,115 | 2 | 3,117 | 2 | 1,109 | 1 | 10,112 | 2 | 6,120 | 0 | 0
Finite State Machines,23 | 15 | 22,32 | 22 | 6,31 | 24 | 5,39 | 18 | 3,31 | 24 | 5,34 | 26 | 0,27 | 23 | 10,39 | 10 | 11,36 | 6 | 18
Mathematical Functions,13 | 19 | 43,6 | 39 | 30,36 | 10 | 29,46 | 24 | 5,7 | 6 | 62,26 | 27 | 22,20 | 26 | 29,5 | 8 | 62,0 | 3 | 72
Basic Arithmetic Operations,37 | 2 | 36,63 | 8 | 4,66 | 9 | 0,68 | 4 | 3,43 | 2 | 30,38 | 22 | 15,27 | 13 | 35,54 | 6 | 15,62 | 13 | 0
Bitwise and Logical Operations,35 | 0 | 25,55 | 0 | 5,58 | 2 | 0,59 | 0 | 1,52 | 0 | 8,47 | 0 | 13,33 | 11 | 16,36 | 0 | 24,55 | 0 | 5
Pipelining,0 | 59 | 16,11 | 54 | 10,26 | 49 | 0,15 | 38 | 22,7 | 38 | 30,15 | 32 | 28,16 | 26 | 33,21 | 31 | 23,6 | 56 | 13
Polynomial Evaluation,19 | 3 | 53,69 | 0 | 6,74 | 1 | 0,68 | 5 | 2,58 | 6 | 11,55 | 2 | 18,28 | 5 | 42,65 | 7 | 3,69 | 6 | 0
Machine Learning,31 | 3 | 41,60 | 8 | 7,60 | 13 | 2,73 | 1 | 1,45 | 28 | 2,63 | 12 | 0,61 | 12 | 2,57 | 2 | 16,64 | 8 | 3
Financial Computing,9 | 23 | 28,21 | 22 | 17,29 | 13 | 18,20 | 20 | 20,11 | 21 | 28,28 | 15 | 17,15 | 12 | 33,16 | 7 | 37,17 | 23 | 20
Encryption,30 | 0 | 15,30 | 2 | 13,25 | 20 | 0,30 | 0 | 15,26 | 0 | 19,25 | 9 | 11,30 | 1 | 14,30 | 0 | 15,30 | 0 | 15
Physics,45 | 3 | 12,57 | 0 | 3,53 | 4 | 3,54 | 5 | 1,41 | 11 | 8,49 | 7 | 4,40 | 17 | 3,38 | 15 | 7,55 | 2 | 3
Climate,8 | 15 | 37,21 | 30 | 9,41 | 11 | 8,41 | 15 | 4,24 | 23 | 13,38 | 19 | 3,19 | 31 | 10,32 | 14 | 14,28 | 19 | 13
evaluate/solution_resource_analysis.csv (new file, 57 lines; minimum optimized LUT count per module and LLM, inf = no solution with synthesis results)
@@ -0,0 +1,57 @@
,gpt-3.5-turbo,gpt-4,gpt-4o,gpt-o1-mini,llama3.1-405B,qwen-max,qwen-plus,qwen2.5-coder-32B-instruct,codestral
parity 8bit,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mux4to1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
majority,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
bin to gray,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
eq comparator,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
decoder 2to4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
seven segment decoder,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
priority encoder,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fsm 3state,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
traffic light,1.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,inf
elevator controller,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
vending machine,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0
int sqrt,inf,inf,68.0,177.0,inf,64.0,229.0,173.0,inf
fibonacci,inf,56.0,1.0,56.0,56.0,56.0,inf,inf,inf
mod exp,inf,inf,4466.0,4669.0,inf,1911.0,1678.0,inf,inf
power,inf,79.0,74.0,93.0,inf,93.0,93.0,93.0,inf
log2 int,inf,inf,inf,10.0,20.0,inf,inf,12.0,inf
add 8bit,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mult 4bit,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
abs diff,12.0,12.0,14.0,12.0,12.0,inf,12.0,12.0,12.0
modulo op,82.0,82.0,82.0,82.0,111.0,inf,inf,inf,inf
subtract 8bit,8.0,8.0,8.0,8.0,inf,inf,inf,8.0,8.0
bitwise ops,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
left shift,10.0,10.0,10.0,10.0,10.0,12.0,12.0,10.0,10.0
bitwise not,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
rotate left,inf,12.0,12.0,12.0,12.0,12.0,inf,12.0,12.0
pipelined adder,inf,0.0,16.0,inf,0.0,inf,0.0,15.0,inf
pipelined multiplier,inf,inf,77.0,70.0,56.0,inf,70.0,inf,inf
pipelined accumulator,inf,inf,inf,inf,27.0,inf,inf,inf,inf
pipelined max finder,inf,0.0,24.0,0.0,24.0,24.0,24.0,24.0,24.0
pipelined fir,inf,inf,inf,inf,inf,inf,inf,inf,inf
polynomial 1,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
polynomial 2,49.0,49.0,0.0,91.0,0.0,91.0,0.0,91.0,49.0
polynomial 3,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
polynomial 4,64.0,33.0,96.0,11.0,108.0,108.0,26.0,18.0,33.0
polynomial 5,inf,0.0,213.0,59.0,16.0,213.0,16.0,16.0,16.0
matrix vector mult,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
relu,8.0,8.0,8.0,8.0,8.0,16.0,8.0,8.0,16.0
gradient descent,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0
mse loss,inf,216.0,64.0,64.0,216.0,64.0,216.0,64.0,64.0
conv2d,inf,0.0,0.0,0.0,inf,0.0,0.0,0.0,0.0
compound interest,inf,13060.0,10135.0,10135.0,52950.0,9247.0,inf,10135.0,52950.0
ddm,inf,815.0,inf,inf,inf,inf,inf,inf,inf
present value,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0
currency converter,inf,inf,0.0,0.0,25.0,0.0,inf,inf,inf
caesar cipher,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
modular add cipher,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
feistel cipher,inf,inf,inf,inf,inf,inf,inf,inf,inf
free fall distance,6.0,6.0,64.0,6.0,6.0,64.0,67.0,64.0,6.0
kinetic energy,70.0,70.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
potential energy,6.0,6.0,84.0,0.0,6.0,6.0,6.0,6.0,6.0
wavelength,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
carbon footprint,174.0,121.0,110.0,92.0,121.0,121.0,110.0,110.0,110.0
heat index,16.0,16.0,201.0,16.0,195.0,16.0,124.0,201.0,201.0
air quality index,inf,inf,128.0,104.0,inf,104.0,116.0,128.0,128.0
solar radiation average,inf,inf,44.0,44.0,44.0,44.0,inf,44.0,inf