first commit

commit ae1cc41f21
Author: jultrishyyy
Date: 2025-02-20 20:38:50 +00:00

15 changed files with 119048 additions and 0 deletions

109
README.md Normal file

@@ -0,0 +1,109 @@
# FPGA Resource Usage Benchmarking for Verilog Solutions
This repository contains the code for benchmarking FPGA resource usage for Verilog solutions generated by different LLMs.
## Simulation and Synthesis Tools
The simulation tool for functional correctness tests and the synthesis tool for obtaining resource usage are both based on **Vivado**, so please install **Vivado** before running the framework. The scripts locate Vivado through a `vivado` environment variable that should point to the directory containing `vivado.bat` (see `functional_correctness.py` and `resource_usage.py`). If you wish to use other tools, modify the relevant Python scripts accordingly.
Some dependencies are listed in `requirements.txt`, which you can install using:
```sh
pip install -r requirements.txt
```
## Benchmark Dataset (`problems.json`)
The `problems.json` file contains our benchmark dataset, formatted as follows:
```json
{
"Combinational Logic": [
{
"module": "parity_8bit",
"Problem": "Implement a Verilog module that computes the parity of an 8-bit input vector. The output should be 1 if the number of '1's in the input is odd, and 0 otherwise.",
"Module header": "module parity_8bit (\n input [7:0] in,\n output out\n);",
"Testbench": "`timescale 1ns / 1ps\n\nmodule parity_8bit_tb; ..."
}
],
"Finite State Machines": []
}
```
You can use this dataset to generate solutions and run functional correctness checks for any LLMs you want to evaluate.
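For example, a minimal Python sketch (assuming `problems.json` is in the working directory) that iterates over the benchmark problems:
```python
import json

# Load the benchmark and walk over every problem (field names follow the format above).
with open("problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

for category, items in problems.items():
    for item in items:
        # item["Problem"] is the prompt, item["Module header"] fixes the interface,
        # and item["Testbench"] is the Vivado testbench used for checking.
        print(category, item["module"])
```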
## Experimental Results (`solutions.json` Format)
The `solutions` directory contains our experimental results, formatted as follows:
```json
{
"gpt-3.5-turbo": {
"Combinational Logic": [
{
"module": "parity_8bit",
"solutions": [
{
"solution": "module parity_8bit (input [7:0] in, output out); assign out = in[0] ^ in[1] ^ in[2] ^ in[3] ^ in[4] ^ in[5] ^ in[6] ^ in[7]; endmodule",
"pass": "true",
"resource usage": {
"optimized": {
"LUT": 2,
"FF": 0,
"DSP": 0,
"BRAM": 0,
"IO": 9
},
"primitives": {
"LUT": 2,
"FF": 0,
"DSP": 0,
"BRAM": 0,
"IO": 9
}
}
}
]
}
],
"Finite State Machines": [
{
"module": "fsm_3state",
"solutions": []
}
]
},
"gpt-4o":{}
}
```
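As a quick illustration of consuming this format, here is a minimal sketch (not part of the framework) that counts passing solutions per model:
```python
import json

# Count how many recorded solutions passed, per model, using the structure shown above.
with open("solutions.json", "r", encoding="utf-8") as f:
    results = json.load(f)

for model, categories in results.items():
    total = passed = 0
    for modules in categories.values():
        for entry in modules:
            for sol in entry.get("solutions", []):
                total += 1
                passed += sol.get("pass", "").strip().lower() == "true"
    print(f"{model}: {passed}/{total} solutions passed")
```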
## Quick Run Instructions
To quickly run the benchmarking process, copy `solutions.json` from the `solutions` directory to the same directory as `setup.py`, then execute:
```sh
python setup.py -generate_solutions gpt-4o 5 your_openai_api_key -functional_correctness -resource_usage
```
This command will:
1. Generate 5 solutions for each problem using `gpt-4o`.
2. Run the functional correctness check on every generated solution.
3. Obtain the resource usage report (LUT, FF, DSP, BRAM, and IO counts) for each passing solution.
The standard script currently supports OpenAI's GPT models. If you want to test other LLMs, modify `generate_solutions.py` accordingly, for example as sketched below.
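For instance, if your target provider exposes an OpenAI-compatible endpoint, one possible tweak is to point the client used in `call_LLMs` at that endpoint (the base URL and key below are only placeholders):
```python
from openai import OpenAI

# Hypothetical example: reuse the existing OpenAI client against an
# OpenAI-compatible endpoint; replace the placeholder URL and key.
client = OpenAI(api_key="your_api_key", base_url="https://your-provider.example/v1")
```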
## Running Functional and Resource Usage Tests on Custom Solutions
You can also run the functional test and resource usage analysis on your own solutions. Ensure that your `solutions.json` follows the format above and place it in the same directory as `setup.py`, then execute:
```sh
python setup.py -functional_correctness -resource_usage
```
## Running Individual Tests
To run the **functional correctness check** alone:
```sh
python setup.py -functional_correctness
```
To run **resource usage analysis** alone:
```sh
python setup.py -resource_usage
```

45
evaluate/count_pass.py Normal file

@@ -0,0 +1,45 @@
import json
import pandas as pd
from collections import defaultdict
# Load the JSON file
file_path = "solutions.json" # Adjust this path based on your local directory
with open(file_path, "r") as f:
data = json.load(f)
# Initialize a dictionary to store the structured results
structured_results = defaultdict(lambda: defaultdict(lambda: {"total": 0, "pass": 0, "syntax_error": 0, "functional_error": 0}))
# Process the data to count various results per LLM and type
for llm, categories in data.items():
for category, modules in categories.items():
for module in modules:
for solution in module.get("solutions", []):
structured_results[category][llm]["total"] += 1
pass_info = solution.get("pass", "")
if pass_info == "true":
structured_results[category][llm]["pass"] += 1
elif "Detected error while running simulation" in pass_info:
structured_results[category][llm]["syntax_error"] += 1
# Functional error count
structured_results[category][llm]["functional_error"] = (
structured_results[category][llm]["total"]
- structured_results[category][llm]["syntax_error"]
- structured_results[category][llm]["pass"]
)
# Create a DataFrame from the structured results
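# Each cell in the CSV is formatted as "pass | functional_error | syntax_error".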
df_restructured = pd.DataFrame.from_dict(
{category: {llm: f"{counts['pass']} | {counts['functional_error']} | {counts['syntax_error']}" for llm, counts in llms.items()}
for category, llms in structured_results.items()},
orient="index"
)
# Save to a CSV file
csv_output_path = "solution_pass_analysis.csv" # Adjust the path as needed
df_restructured.to_csv(csv_output_path)
print(f"CSV file saved at: {csv_output_path}")
# print(df_restructured)

32
evaluate/count_resource.py Normal file

@@ -0,0 +1,32 @@
import json
import pandas as pd
from collections import defaultdict
# Load the JSON file
file_path = "solutions.json"
with open(file_path, "r") as f:
data = json.load(f)
# Initialize a dictionary to store the minimal LUT usage for each module and LLM
lut_results = defaultdict(lambda: defaultdict(lambda: float("inf")))
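# Modules with no synthesized passing solution keep the initial value inf, which is what appears in the output CSV.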
# Process the data to extract the minimum LUT usage per module per LLM
for llm, categories in data.items():
for category, modules in categories.items():
for module_data in modules:
module_name = module_data["module"].replace("_", " ") # Replace underscores with spaces
for solution in module_data.get("solutions", []):
if "resource usage" in solution and "optimized" in solution["resource usage"]:
lut_count = solution["resource usage"]["optimized"].get("LUT", float("inf"))
# Store the minimum LUT usage
lut_results[module_name][llm] = min(lut_results[module_name][llm], lut_count)
# Convert the dictionary into a DataFrame
df_lut = pd.DataFrame.from_dict(lut_results, orient="index")
# Save to a CSV file
csv_output_path = "solution_resource_analysis.csv"
df_lut.to_csv(csv_output_path)
# Print the CSV file path
print(f"CSV file saved at: {csv_output_path}")

136
evaluate/plot_pass.py Normal file

@@ -0,0 +1,136 @@
import json
import matplotlib.pyplot as plt
import re
import seaborn as sns
import pandas as pd
# --- Utility Functions ---
def compute_module_pass(solution_list, k):
"""
Check the first k solutions for a module.
Return 1 if at least one of them has a "pass" value (after stripping and lowercasing) equal to "true",
otherwise return 0.
"""
for sol in solution_list[:k]:
if sol.get("pass", "").strip().lower() == "true":
return 1
return 0
def compute_pass_at_k_for_modules(modules, k):
"""
Given a list of modules (each module is expected to have a "solutions" list),
compute the fraction of modules that pass@k.
"""
total = len(modules)
if total == 0:
return 0
passed = sum(compute_module_pass(mod["solutions"], k) for mod in modules)
return passed / total
def compute_overall_pass_at_k(llm_data, ks):
"""
Given one LLM's data (a dict mapping category names to lists of modules),
compute the overall pass@k (over all modules in all categories).
Returns a dictionary mapping each k to the pass@k value.
"""
all_modules = []
for cat, modules in llm_data.items():
all_modules.extend(modules)
overall = {}
for k in ks:
overall[k] = compute_pass_at_k_for_modules(all_modules, k)
return overall
def compute_category_pass_at_k(llm_data, ks):
"""
For each category (type) in one LLM, compute pass@k.
Returns a dictionary mapping category names to a dictionary of k -> pass@k.
"""
cat_results = {}
for cat, modules in llm_data.items():
k_dict = {}
for k in ks:
k_dict[k] = compute_pass_at_k_for_modules(modules, k)
cat_results[cat] = k_dict
return cat_results
# --- Main processing and plotting ---
# Choose the k values you want to evaluate pass@k for:
ks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
# Load the JSON file.
input_json_file = "solutions.json" # adjust filename if necessary
with open(input_json_file, "r") as f:
data = json.load(f)
# We'll store our computed pass@k results per LLM in a dictionary.
llm_results = {}
for llm, llm_data in data.items():
overall = compute_overall_pass_at_k(llm_data, ks)
categories = compute_category_pass_at_k(llm_data, ks)
llm_results[llm] = {
"overall": overall,
"categories": categories
}
# --- Plot Overall Pass@k for each LLM ---
plt.figure(figsize=(10, 6))
for llm, res in llm_results.items():
plt.plot(ks, [res["overall"][k] for k in ks], marker='o', label=llm)
# plt.xticks(ks) # Ensure all values from 1 to 15 are shown
# plt.xlabel("k", fontsize=14)
# plt.ylabel("Overall Pass@k", fontsize=14)
# plt.title("Overall Pass@k across k for each LLM", fontsize=16) # Larger title
# plt.legend(loc="upper left", bbox_to_anchor=(1, 1)) # Legend outside the plot
# plt.grid(True)
# plt.tight_layout()
# plt.savefig("./figures/overall_pass_at_k.png")
# plt.show()
# --- Plot Per-Category Pass@k for all LLMs, one heatmap per k ---
for k in ks:
    # Collect pass@k for every (category, LLM) pair at this k.
    category_pass_k = {}
    for llm, res in llm_results.items():
        for cat, kdict in res["categories"].items():
            category_pass_k.setdefault(cat, {})[llm] = kdict[k]
    # Convert to a DataFrame with categories as rows and LLMs as columns.
    df_heatmap = pd.DataFrame.from_dict(category_pass_k, orient="index")
    # Plot heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(df_heatmap, annot=True, cmap="Blues", linewidths=0.5, fmt=".2f")
    plt.title(f"Pass@{k} Heatmap for Each LLM Across Categories", fontsize=16, fontweight="bold")
    plt.xlabel("LLM", fontsize=14, fontweight="bold")
    plt.ylabel("Category", fontsize=14, fontweight="bold")
    plt.xticks(rotation=45, ha="right", fontsize=12)
    plt.yticks(fontsize=12)
    plt.tight_layout()
    heatmap_path = f"./figures/per_category_pass_k{k}_heatmap.png"
    plt.savefig(heatmap_path)
    plt.close()
# --- (Optional) Print the computed results ---
print("Overall Pass@k per LLM:")
for llm, res in llm_results.items():
print(f"{llm}: {res['overall']}")
print("\nPer-Category Pass@k per LLM:")
for llm, res in llm_results.items():
print(f"{llm}:")
for cat, kdict in res["categories"].items():
print(f" {cat}: {kdict}")

13
solution_pass_analysis.csv Normal file

@@ -0,0 +1,13 @@
,gpt-3.5-turbo,gpt-4,gpt-4o,gpt-o1-mini,llama3.1-405B,qwen-max,qwen-plus,qwen2.5-coder-32B-instruct,codestral
Combinational Logic,112 | 5 | 3,117 | 3 | 0,120 | 0 | 0,118 | 1 | 1,115 | 2 | 3,117 | 2 | 1,109 | 1 | 10,112 | 2 | 6,120 | 0 | 0
Finite State Machines,23 | 15 | 22,32 | 22 | 6,31 | 24 | 5,39 | 18 | 3,31 | 24 | 5,34 | 26 | 0,27 | 23 | 10,39 | 10 | 11,36 | 6 | 18
Mathematical Functions,13 | 19 | 43,6 | 39 | 30,36 | 10 | 29,46 | 24 | 5,7 | 6 | 62,26 | 27 | 22,20 | 26 | 29,5 | 8 | 62,0 | 3 | 72
Basic Arithmetic Operations,37 | 2 | 36,63 | 8 | 4,66 | 9 | 0,68 | 4 | 3,43 | 2 | 30,38 | 22 | 15,27 | 13 | 35,54 | 6 | 15,62 | 13 | 0
Bitwise and Logical Operations,35 | 0 | 25,55 | 0 | 5,58 | 2 | 0,59 | 0 | 1,52 | 0 | 8,47 | 0 | 13,33 | 11 | 16,36 | 0 | 24,55 | 0 | 5
Pipelining,0 | 59 | 16,11 | 54 | 10,26 | 49 | 0,15 | 38 | 22,7 | 38 | 30,15 | 32 | 28,16 | 26 | 33,21 | 31 | 23,6 | 56 | 13
Polynomial Evaluation,19 | 3 | 53,69 | 0 | 6,74 | 1 | 0,68 | 5 | 2,58 | 6 | 11,55 | 2 | 18,28 | 5 | 42,65 | 7 | 3,69 | 6 | 0
Machine Learning,31 | 3 | 41,60 | 8 | 7,60 | 13 | 2,73 | 1 | 1,45 | 28 | 2,63 | 12 | 0,61 | 12 | 2,57 | 2 | 16,64 | 8 | 3
Financial Computing,9 | 23 | 28,21 | 22 | 17,29 | 13 | 18,20 | 20 | 20,11 | 21 | 28,28 | 15 | 17,15 | 12 | 33,16 | 7 | 37,17 | 23 | 20
Encryption,30 | 0 | 15,30 | 2 | 13,25 | 20 | 0,30 | 0 | 15,26 | 0 | 19,25 | 9 | 11,30 | 1 | 14,30 | 0 | 15,30 | 0 | 15
Physics,45 | 3 | 12,57 | 0 | 3,53 | 4 | 3,54 | 5 | 1,41 | 11 | 8,49 | 7 | 4,40 | 17 | 3,38 | 15 | 7,55 | 2 | 3
Climate,8 | 15 | 37,21 | 30 | 9,41 | 11 | 8,41 | 15 | 4,24 | 23 | 13,38 | 19 | 3,19 | 31 | 10,32 | 14 | 14,28 | 19 | 13

57
solution_resource_analysis.csv Normal file

@@ -0,0 +1,57 @@
,gpt-3.5-turbo,gpt-4,gpt-4o,gpt-o1-mini,llama3.1-405B,qwen-max,qwen-plus,qwen2.5-coder-32B-instruct,codestral
parity 8bit,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mux4to1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
majority,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
bin to gray,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
eq comparator,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
decoder 2to4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
seven segment decoder,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
priority encoder,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
fsm 3state,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
traffic light,1.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,inf
elevator controller,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
vending machine,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0
int sqrt,inf,inf,68.0,177.0,inf,64.0,229.0,173.0,inf
fibonacci,inf,56.0,1.0,56.0,56.0,56.0,inf,inf,inf
mod exp,inf,inf,4466.0,4669.0,inf,1911.0,1678.0,inf,inf
power,inf,79.0,74.0,93.0,inf,93.0,93.0,93.0,inf
log2 int,inf,inf,inf,10.0,20.0,inf,inf,12.0,inf
add 8bit,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mult 4bit,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
abs diff,12.0,12.0,14.0,12.0,12.0,inf,12.0,12.0,12.0
modulo op,82.0,82.0,82.0,82.0,111.0,inf,inf,inf,inf
subtract 8bit,8.0,8.0,8.0,8.0,inf,inf,inf,8.0,8.0
bitwise ops,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
left shift,10.0,10.0,10.0,10.0,10.0,12.0,12.0,10.0,10.0
bitwise not,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
rotate left,inf,12.0,12.0,12.0,12.0,12.0,inf,12.0,12.0
pipelined adder,inf,0.0,16.0,inf,0.0,inf,0.0,15.0,inf
pipelined multiplier,inf,inf,77.0,70.0,56.0,inf,70.0,inf,inf
pipelined accumulator,inf,inf,inf,inf,27.0,inf,inf,inf,inf
pipelined max finder,inf,0.0,24.0,0.0,24.0,24.0,24.0,24.0,24.0
pipelined fir,inf,inf,inf,inf,inf,inf,inf,inf,inf
polynomial 1,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
polynomial 2,49.0,49.0,0.0,91.0,0.0,91.0,0.0,91.0,49.0
polynomial 3,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
polynomial 4,64.0,33.0,96.0,11.0,108.0,108.0,26.0,18.0,33.0
polynomial 5,inf,0.0,213.0,59.0,16.0,213.0,16.0,16.0,16.0
matrix vector mult,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
relu,8.0,8.0,8.0,8.0,8.0,16.0,8.0,8.0,16.0
gradient descent,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0
mse loss,inf,216.0,64.0,64.0,216.0,64.0,216.0,64.0,64.0
conv2d,inf,0.0,0.0,0.0,inf,0.0,0.0,0.0,0.0
compound interest,inf,13060.0,10135.0,10135.0,52950.0,9247.0,inf,10135.0,52950.0
ddm,inf,815.0,inf,inf,inf,inf,inf,inf,inf
present value,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0,107946.0
currency converter,inf,inf,0.0,0.0,25.0,0.0,inf,inf,inf
caesar cipher,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
modular add cipher,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
feistel cipher,inf,inf,inf,inf,inf,inf,inf,inf,inf
free fall distance,6.0,6.0,64.0,6.0,6.0,64.0,67.0,64.0,6.0
kinetic energy,70.0,70.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
potential energy,6.0,6.0,84.0,0.0,6.0,6.0,6.0,6.0,6.0
wavelength,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
carbon footprint,174.0,121.0,110.0,92.0,121.0,121.0,110.0,110.0,110.0
heat index,16.0,16.0,201.0,16.0,195.0,16.0,124.0,201.0,201.0
air quality index,inf,inf,128.0,104.0,inf,104.0,116.0,128.0,128.0
solar radiation average,inf,inf,44.0,44.0,44.0,44.0,inf,44.0,inf

Binary file not shown (image, 84 KiB)

131
functional_correctness.py Normal file

@@ -0,0 +1,131 @@
import json
import os
import re
import subprocess
# File paths
SOLUTIONS_FILE = "solutions.json"
PROBLEMS_FILE = "problems.json"
TEMP_VERILOG_FILE = "temp.v"
TEMP_TESTBENCH_FILE = "testbench.v"
TCL_SCRIPT_FILE = "run_testbench.tcl"
def write_tcl(top_module):
    # Generate the Tcl script for Vivado; top_module is the testbench's top module.
    tcl_commands = f"""
create_project temp_project ./temp_project -force -part xc7z020clg400-1
set_property source_mgmt_mode All [current_project]
add_files {TEMP_VERILOG_FILE}
add_files -fileset sim_1 {TEMP_TESTBENCH_FILE}
set_property top {top_module} [get_filesets sim_1]
launch_simulation -simset sim_1 -mode behavioral
run 3000ns
close_sim
exit
"""
    # Write the Tcl script
    with open(TCL_SCRIPT_FILE, "w", encoding="utf-8") as file:
        file.write(tcl_commands)
# Function to extract the top module name from the testbench
def extract_top_module_name(testbench_file):
with open(testbench_file, 'r', encoding="utf-8") as file:
for line in file:
match = re.search(r'\s*module\s+(\w+)\s*;', line)
if match:
print(match.group(1))
return match.group(1) # Extract module name
return None # Return None if no module found
def run_functional_correctness():
# Load JSON files
with open(SOLUTIONS_FILE, "r", encoding="utf-8") as file:
solutions_data = json.load(file)
with open(PROBLEMS_FILE, "r", encoding="utf-8") as file:
problems_data = json.load(file)
# Map module names to their testbenches
module_testbenches = {}
for category, problems in problems_data.items():
for problem in problems:
module_name = problem.get("module")
testbench_code = problem.get("Testbench")
if module_name and testbench_code:
module_testbenches[module_name] = testbench_code
# print(module_testbenches.keys())
# Get Vivado path from environment variable
vivado_path = os.environ.get("vivado")
if not vivado_path:
raise EnvironmentError("Vivado environment variable not set.")
vivado_path = os.path.join(vivado_path, "vivado.bat")
# Iterate over solutions and test them
for model, categories in solutions_data.items():
for category, modules in categories.items():
for module_entry in modules:
module_name = module_entry["module"]
# print(module_name)
# print(module_name in module_testbenches.keys())
if module_name not in module_testbenches:
print(f"Skipping {module_name}: No testbench found.")
continue
testbench_code = module_testbenches[module_name]
solutions = module_entry["solutions"]
# Iterate over all solutions
for solution_entry in solutions:
verilog_code = solution_entry["solution"]
# Write the Verilog design to a file
with open(TEMP_VERILOG_FILE, "w", encoding="utf-8") as f:
f.write(verilog_code)
# Write the testbench to a file
with open(TEMP_TESTBENCH_FILE, "w", encoding="utf-8") as f:
f.write(testbench_code)
# Extract the top module name
top_module = extract_top_module_name(TEMP_TESTBENCH_FILE)
if not top_module:
print(f"Error: Could not extract top module from {module_name}. Skipping...")
solution_entry["pass"] = "Error: Could not extract top module."
continue
print(f"Testing module: {module_name} (Top Module: {top_module})")
                    write_tcl(top_module)
# Run Vivado in batch mode
print(f"Running Vivado simulation for {module_name}...")
process = subprocess.run([vivado_path, "-mode", "batch", "-source", TCL_SCRIPT_FILE], capture_output=True, text=True)
# Capture output logs
output_log = process.stdout + "\n" + process.stderr
print(output_log)
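                    # The testbench is expected to print "All tests passed" on success;
                    # anything else is treated as a failure below.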
test_passed = "All tests passed" in output_log
# Determine pass/fail status
if test_passed:
solution_entry["pass"] = "true"
else:
# Extract relevant error messages
                        error_lines = "\n".join(line for line in output_log.split("\n") if "error" in line.lower() or "fail" in line.lower())
solution_entry["pass"] = error_lines if error_lines else "Test failed somehow"
print(f"Test result for {module_name}: {'PASS' if test_passed else 'FAIL'}")
# Save results after testing each module
with open(SOLUTIONS_FILE, "w", encoding="utf-8") as file:
json.dump(solutions_data, file, indent=4)
print("All tests completed.")

111
generate_solutions.py Normal file

@@ -0,0 +1,111 @@
import json
import os
import re
from openai import OpenAI
def load_prompt_data(filepath: str) -> dict:
"""
Loads the prompt data from JSON.
"""
with open(filepath, "r", encoding="utf-8") as f:
return json.load(f)
def load_solutions(filepath: str) -> dict:
"""
Loads the existing solutions JSON, or returns a default if file not found.
"""
if os.path.exists(filepath):
with open(filepath, "r", encoding="utf-8") as f:
return json.load(f)
return {}
def save_solutions(filepath: str, solutions: dict):
"""
Saves the solutions dictionary to the solutions.json file (pretty-printed).
"""
with open(filepath, "w", encoding="utf-8") as f:
json.dump(solutions, f, indent=4)
def call_LLMs(client, model: str, problem: str, module_header: str) -> str:
"""
Calls the OpenAI chat completion endpoint with the given prompt.
"""
prompt = f"""
Here we assume the SystemVerilog is not supported, so don't use the SystemVerilog syntax, such as break statement.
Please write a Verilog module that solves the following problem efficiently, using the exact module header below:
Problem:
{problem}
Module header (must not be changed):
{module_header}
Remember to return only the JSON format:
{{
"solution": "<verilog code>"
}}
"""
try:
response = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful Verilog coding assistant. Please return a JSON object with a key 'solution' containing the Verilog code."},
{"role": "user", "content": prompt}
],
model=model,
max_tokens=3000,
temperature=1.5,
top_p=0.75,
)
response_content = response.choices[0].message.content.strip()
return response_content
except Exception as e:
print("Error:", str(e))
return json.dumps({"solution": f"Error: {str(e)}"})
def generate_solutions(api_key: str, model_name: str, k: int, prompt_json_file: str = "problems.json", solutions_json_file: str = "solutions.json"):
"""
Generates Verilog solutions for problems using an LLM.
"""
# Initialize OpenAI client
client = OpenAI(api_key=api_key)
# Load the problem data
prompt_data = load_prompt_data(prompt_json_file)
# Load or initialize solutions data
solutions_data = load_solutions(solutions_json_file)
if model_name not in solutions_data:
solutions_data[model_name] = {}
for _ in range(k):
for category, problems in prompt_data.items():
if category not in solutions_data[model_name]:
solutions_data[model_name][category] = []
for item in problems:
problem_statement = item.get("Problem", "")
module_header = item.get("Module header", "")
module_name = item.get("module")
response_json_str = call_LLMs(client, model_name, problem_statement, module_header)
                # Strip a possible Markdown code fence around the reply without touching its contents.
                response_json_str = re.sub(r"^```(?:json)?\s*|\s*```$", "", response_json_str.strip())
try:
response_json = json.loads(response_json_str)
verilog_code = response_json.get("solution", "")
except json.JSONDecodeError:
print(response_json_str)
verilog_code = "Error: Invalid JSON response"
print(f"Processing module: {module_name}")
category_list = solutions_data[model_name][category]
module_entry = next((entry for entry in category_list if entry.get("module") == module_name), None)
if module_entry is None:
module_entry = {"module": module_name, "solutions": []}
category_list.append(module_entry)
module_entry["solutions"].append({"solution": verilog_code, "pass": ""})
save_solutions(solutions_json_file, solutions_data)

362
problems.json Normal file

File diff suppressed because one or more lines are too long

4
requirements.txt Normal file

@@ -0,0 +1,4 @@
matplotlib==3.10.0
openai==1.63.2
pandas==2.2.3
seaborn==0.13.2

194
resource_usage.py Normal file

@@ -0,0 +1,194 @@
import json
import subprocess
import os
import re
def extract_module_name(verilog_code):
"""
Extract the module name from the Verilog code.
Assumes the module declaration is of the form:
module <module_name> (
Returns the module name as a string, or None if not found.
"""
match = re.search(r'\bmodule\s+(\w+)', verilog_code)
if match:
return match.group(1)
return None
def parse_optimized(lines):
"""
Extract resource usage numbers from the main (optimized) report sections.
Returns a dictionary with keys: LUT, FF, DSP, BRAM, IO.
"""
optimized = {"LUT": None, "FF": None, "DSP": None, "BRAM": None, "IO": None}
for line in lines:
m = re.search(r'\|\s*Slice LUTs\*?\s*\|\s*(\d+)', line)
if m:
optimized["LUT"] = int(m.group(1))
m = re.search(r'\|\s*Slice Registers\s*\|\s*(\d+)', line)
if m:
optimized["FF"] = int(m.group(1))
m = re.search(r'\|\s*DSPs\s*\|\s*(\d+)', line)
if m:
optimized["DSP"] = int(m.group(1))
m = re.search(r'\|\s*Block RAM Tile\s*\|\s*(\d+)', line)
if m:
optimized["BRAM"] = int(m.group(1))
m = re.search(r'\|\s*Bonded IOB\s*\|\s*(\d+)', line)
if m:
optimized["IO"] = int(m.group(1))
return optimized
def extract_primitives_section(lines):
"""
Extracts all lines between the "7. Primitives" header and the "8. Black Boxes" header.
"""
start_marker = "7. Primitives"
end_marker = "8. Black Boxes"
start_idx = None
end_idx = None
for idx, line in enumerate(lines):
if start_idx is None and start_marker in line and (idx + 1 < len(lines) and "------" in lines[idx + 1]):
start_idx = idx
elif start_idx is not None and end_marker in line and (idx + 1 < len(lines) and "------" in lines[idx + 1]):
end_idx = idx
break
if start_idx is None or end_idx is None:
return []
return lines[start_idx:end_idx]
def parse_primitives_section(lines):
"""
Parses the primitives section lines to accumulate resource usage.
Returns a dictionary with keys: LUT, FF, DSP, BRAM, IO.
In this example:
- For LUT: sums up any primitive whose name starts with "LUT" (e.g., LUT2, LUT3, ...)
- For IO: sums the usage of IBUF and OBUF.
"""
resources = {"LUT": 0, "FF": 0, "DSP": 0, "BRAM": 0, "IO": 0}
for line in lines:
stripped_line = line.strip()
if not stripped_line.startswith("|"):
continue
parts = stripped_line.split("|")
if len(parts) < 4:
continue
ref_name = parts[1].strip()
used_str = parts[2].strip()
try:
used = int(used_str)
except ValueError:
continue
if ref_name.startswith("LUT"):
resources["LUT"] += used
if ref_name in ("IBUF", "OBUF"):
resources["IO"] += used
# (Add additional processing for FF, DSP, BRAM if necessary.)
return resources
def run_synthesis(solution_code):
"""
Writes the given Verilog solution to a temporary file,
creates a Tcl script for Vivado to run synthesis and generate a utilization report,
runs Vivado in batch mode, and parses the resource usage report.
Returns a dictionary with keys "optimized" and "primitives" containing resource usage.
"""
# Write the Verilog code to a temporary file.
verilog_file = "temp.v"
with open(verilog_file, "w") as f:
f.write(solution_code)
# Extract the module name from the solution code.
top_module = extract_module_name(solution_code)
print(top_module)
if top_module is None:
print("Could not extract module name; using 'temp_top' as a default.")
top_module = "temp_top"
vivado_project = "temp_project"
tcl_script = "synthesis_script.tcl"
# Get the Vivado installation path from the environment variable.
vivado_path_env = os.environ.get("vivado")
if vivado_path_env is None:
print("Error: 'vivado' environment variable is not set.")
return None
vivado_path = os.path.join(vivado_path_env, "vivado.bat")
# Create the Vivado Tcl script.
tcl_commands = f"""
create_project {vivado_project} -force -part xc7z020clg400-1
add_files {verilog_file}
set_property top {top_module} [current_fileset]
# Run synthesis only (no simulation)
synth_design -top {top_module}
# Generate resource utilization report
report_utilization -file resource_usage.rpt
quit
"""
with open(tcl_script, "w") as file:
file.write(tcl_commands)
# Run Vivado in batch mode using the generated Tcl script.
try:
result = subprocess.run(
[vivado_path, "-mode", "batch", "-source", tcl_script],
capture_output=True, text=True, check=True
)
except subprocess.CalledProcessError as e:
print("Synthesis failed:", e)
return None
print(result.stdout)
# Check for the success message in the output.
if "Finished Writing Synthesis Report" in result.stdout:
# Read the resource utilization report.
with open("resource_usage.rpt", "r") as f:
report_lines = f.readlines()
optimized_resources = parse_optimized(report_lines)
primitives_section = extract_primitives_section(report_lines)
primitives_resources = (parse_primitives_section(primitives_section)
if primitives_section else {})
return {"optimized": optimized_resources, "primitives": primitives_resources}
else:
print("Synthesis did not complete successfully.")
return None
def run_resource_usage():
# Load the original JSON.
input_json_file = "solutions.json" # Update this file name if needed.
with open(input_json_file, "r") as f:
data = json.load(f)
    # Traverse all top-level models (e.g., "gpt-4o") and all of their categories.
for top_key, top_value in data.items():
# print(top_value.keys())
# exit()
# top_value should be a dict with categories (e.g., "Combinational Logic", "Finite State Machines", etc.)
for category, module_list in top_value.items():
# if category == "Combinational Logic":
# continue
for module in module_list:
for sol in module["solutions"]:
if sol.get("pass", "").strip().lower() == "true":
solution_code = sol["solution"]
print(f"Running synthesis for module '{module['module']}' in category '{category}'")
resource_usage = run_synthesis(solution_code)
if resource_usage:
sol["resource usage"] = resource_usage
else:
sol["resource usage"] = {"optimized": {}, "primitives": {}}
else:
sol["resource usage"] = {"optimized": {}, "primitives": {}}
# Write the updated JSON (with resource usage added) to a new file.
output_json_file = "solutions.json"
with open(output_json_file, "w") as f:
json.dump(data, f, indent=4)
print(f"Updated JSON written to {output_json_file}")

42
setup.py Normal file

@@ -0,0 +1,42 @@
import argparse
import subprocess
from generate_solutions import generate_solutions
from functional_correctness import run_functional_correctness
from resource_usage import run_resource_usage
def main():
    parser = argparse.ArgumentParser(description="Command-line interface for Verilog solution generation and evaluation.")
    parser.add_argument("-generate_solutions", nargs=3, metavar=("MODEL_NAME", "K", "API_KEY"), help="Generate Verilog solutions using the specified model, number of solutions per problem, and API key.")
    parser.add_argument("-functional_correctness", action="store_true", help="Run functional correctness evaluation.")
    parser.add_argument("-resource_usage", action="store_true", help="Run resource usage evaluation.")
    args = parser.parse_args()
    # Each stage runs at most once, in order: generation, functional check, resource usage.
    if args.generate_solutions:
        model_name, k, api_key = args.generate_solutions
        generate_solutions(api_key, model_name, int(k))
    if args.functional_correctness:
        run_functional_correctness()
        subprocess.run(["python", "./evaluate/count_pass.py"])
        subprocess.run(["python", "./evaluate/plot_pass.py"])
    if args.resource_usage:
        run_resource_usage()
        subprocess.run(["python", "./evaluate/count_resource.py"])
if __name__ == "__main__":
main()

60
solutions/sample.json Normal file

@@ -0,0 +1,60 @@
{
"gpt-3.5-turbo": {
"Combinational Logic": [
{
"module": "parity_8bit",
"solutions": [
{
"solution": "module parity_8bit (input [7:0] in, output out); assign out = in[0] ^ in[1] ^ in[2] ^ in[3] ^ in[4] ^ in[5] ^ in[6] ^ in[7]; endmodule",
"pass": "true",
"resource usage": {
"optimized": {
"LUT": 2,
"FF": 0,
"DSP": 0,
"BRAM": 0,
"IO": 9
},
"primitives": {
"LUT": 2,
"FF": 0,
"DSP": 0,
"BRAM": 0,
"IO": 9
}
}
},
{
"solution": "module parity_8bit (input [7:0] in, output out); reg parity; integer i; always @(*) begin parity = 1'b0; for(i=0; i<8; i=i+1) begin if(in[i] == 1'b1) parity = ~parity; end end assign out = parity; endmodule",
"pass": "true",
"resource usage": {
"optimized": {
"LUT": 2,
"FF": 0,
"DSP": 0,
"BRAM": 0,
"IO": 9
},
"primitives": {
"LUT": 2,
"FF": 0,
"DSP": 0,
"BRAM": 0,
"IO": 9
}
}
}
]
}
],
"Finite State Machines": [
{
"module": "fsm_3state",
"solutions": []
}
]
},
"gpt-4o":{
}
}

117752
solutions/solutions.json Normal file

File diff suppressed because one or more lines are too long