Eval

# Load settings from a .env file before anything below needs them
# (presumably python-dotenv's load_dotenv; the import is not visible here).
# The captured cell output right below shows the call returned True.
load_dotenv()
True

Score Answer Value

# is_numeric: a decimal string counts as numeric, a bare letter does not.
for _text, _want in (("1.0", True), ("a", False)):
    fc.test_eq(is_numeric(_text), _want)

# score_answer_value(first, second) -> expected result.
# The cases pin down that:
#   * identical strings match (including the "is_blank" sentinel),
#   * a list-wrapped value does not match the bare string,
#   * a differing prefix ("The ...") does not match,
#   * "4.3" and "4.13" are different values.
_answer_value_cases = (
    ("is_blank", "is_blank", True),
    (["is_blank"], "is_blank", False),
    ("ML.ENERGY Benchmark", "The ML.ENERGY Benchmark", False),
    ("ML.ENERGY Benchmark", "ML.ENERGY Benchmark", True),
    ("4.3", "4.3", True),
    ("4.3", "4.13", False),
)
for _first, _second, _want in _answer_value_cases:
    fc.test_eq(score_answer_value(_first, _second), _want)

Score Ref ID

# score_ref_id(first, second) -> expected result.
# The cases pin down that:
#   * two "is_blank" sentinels match,
#   * a list-wrapped sentinel does not match the bare sentinel string,
#   * equal id lists match,
#   * an id list also matches the same list given in its string form.
_ref_id_cases = (
    ("is_blank", "is_blank", True),
    (["is_blank"], "is_blank", False),
    (["patterson2021"], ["patterson2021"], True),
    (["patterson2021"], '["patterson2021"]', True),
)
for _first, _second, _want in _ref_id_cases:
    fc.test_eq(score_ref_id(_first, _second), _want)

Score is_na

# Reference record: every field except "answer" and "explanation" carries the
# "is_blank" sentinel.
expected_answer = dict(
    answer="False",
    answer_unit="is_blank",
    answer_value="is_blank",
    explanation="Quote",
    ref_id="is_blank",
    ref_url="is_blank",
    supporting_materials="is_blank",
)

# Case 1: the prediction is field-for-field equal to the reference.
# All values are plain strings, so a shallow copy yields an equal,
# independent dict.
predicted_answer = dict(expected_answer)
fc.test_eq(score_is_na(predicted_answer, expected_answer), True)

# Case 2: same record, but the prediction supplies a concrete answer_value and
# supporting text. Overriding existing keys keeps the original key order.
predicted_answer = {
    **expected_answer,
    "answer_value": "0",
    "supporting_materials": "The limited availability of this data significantly reduces transparency and accountability, thereby weakening the potential for public oversight and market responses.",
}
fc.test_eq(score_is_na(predicted_answer, expected_answer), False)

Wattbot Score

# Reference record for a question whose answer fields are blank but which
# still carries a unit ("MWh").
expected_answer = dict(
    answer="Unable to answer with confidence based on the ...",
    answer_unit="MWh",
    answer_value="is_blank",
    explanation="is_blank",
    ref_id="is_blank",
    ref_url="is_blank",
    supporting_materials="is_blank",
)

# The prediction differs from the reference only in its free-text "answer".
# All values are strings, so unpacking gives an independent dict with the
# same key order.
predicted_answer = {**expected_answer, "answer": "Unanswerable from the context."}

ws = calculate_wattbot_score(predicted_answer, expected_answer)

# Expected component scores for this pair, checked in the original order.
for _field, _want in (
    ("na_score", 0.0),
    ("answer_score", 1.0),
    ("ref_score", 1.0),
    ("score", 0.9),
):
    fc.test_eq(getattr(ws, _field), _want)
def _khan2025_record():
    """Return a fresh copy of the fully-populated khan2025 answer record."""
    return {
        "answer": "Local inference was emphasized as a sustainability measure because it reduces both network overhead and carbon footprint when deploying large language models.",
        "answer_unit": "is_blank",
        "answer_value": "1",
        "explanation": "is_blank",
        "ref_id": ["khan2025"],
        "ref_url": ["https://arxiv.org/pdf/2504.06307"],
        "supporting_materials": "The proposed framework tackles energy efficiency in LLM deployment through three interconnected components: local inference optimization, the selection of energy-efficient LLMs, and a comprehensive evaluation methodology.",
    }


# Prediction and reference are equal but separate objects (the list values are
# not shared between them).
predicted_answer = _khan2025_record()
expected_answer = _khan2025_record()

ws = calculate_wattbot_score(predicted_answer, expected_answer)

# An exact match is expected to earn full marks on every component.
for _field in ("na_score", "answer_score", "ref_score", "score"):
    fc.test_eq(getattr(ws, _field), 1.0)

Evaluation

# Build the pipeline under evaluation: chunk the documents, index the chunks
# with lexical search, and wrap the index in a RAG generator.
all_chunks = retriever.chunk_all(retriever.chunk_doc)
ls = retriever.LexicalSearch(all_chunks)
# utils.fw() supplies the second RAG argument (presumably a Fireworks client,
# given the model path — TODO confirm).
rag = generator.RAG(ls, utils.fw(), model='accounts/fireworks/models/kimi-k2p5')
# Hand-written description of this experiment's configuration.
# NOTE(review): none of these values are passed to the chunker/retriever calls
# above, so they are descriptive labels only — confirm that chunk_size and
# chunk_step match what retriever.chunk_doc actually uses.
experiment_metadata = {
    'pdf_extraction': 'pypdf',
    'chunking': 'character_level',
    'chunk_size': 1500,
    'chunk_step': 1400,
    'retrieval': 'lexical_search'
}

# Run the evaluation with this pipeline and metadata; the captured output
# below shows 41 rows processed and a printed value of 32.7.
evaluate_train(rag, experiment_metadata)
Processing Rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [01:35<00:00,  2.33s/it]
32.7

Test

# Adds an 'output_path' entry to the same experiment_metadata dict that was
# passed to evaluate_train earlier (presumably the CSV file that
# create_submission writes — TODO confirm).
experiment_metadata['output_path'] = 'submission_v1.csv'
# NOTE(review): the captured output of this call shows pandas FutureWarnings
# ("Setting an item of incompatible dtype ... float64") raised from the
# `df.loc[i, <col>] = str(...)` assignments inside create_submission. The
# target columns are float64 while the values are strings; pandas states this
# will become an error in a future version. The fix belongs in
# create_submission (e.g. cast those columns to object/str dtype before
# assigning), not in this cell.
create_submission(rag, experiment_metadata)
Answering question:   0%|                                                                                                                                                                    | 0/282 [00:00<?, ?it/s]/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'The total energy consumption of U.S. data centers increased by about 4% from 2010-2014' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[i, 'answer'] = str(answer['answer'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:10: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '4' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[i, 'answer_value'] = str(answer['answer_value'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:12: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['wu2021b']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[i, 'ref_id'] = str(answer['ref_id'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:13: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['https://arxiv.org/pdf/2108.06738']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[i, 'ref_url'] = str(answer['ref_url'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:14: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'The total energy consumption of the US data centers increased by about 4% from 2010-2014' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[i, 'supporting_materials'] = str(answer['supporting_materials'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:15: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Chunk 9 from wu2021b explicitly states that U.S. data center energy consumption increased by about 4% between 2010-2014, which directly answers the question about the average increase in electricity consumption during this period.' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[i, 'explanation'] = str(answer['explanation'])
Answering question: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [10:45<00:00,  2.29s/it]