# Presumably python-dotenv's load_dotenv (the import is not visible in this
# part of the file) - TODO confirm. The "True" that was fused onto the end of
# this line is the captured cell output, not code; it made the line a syntax
# error, so it is preserved below as a comment instead.
load_dotenv()
# Out: True
# Table-driven checks of score_answer_value. Each row holds the two positional
# arguments in call order, followed by the result fc.test_eq should see.
for first, second, outcome in (
    ("is_blank", "is_blank", True),  # identical sentinel strings
    (["is_blank"], "is_blank", False),  # sentinel wrapped in a list: expected False
    ("ML.ENERGY Benchmark", "The ML.ENERGY Benchmark", False),  # extra leading word
    ("ML.ENERGY Benchmark", "ML.ENERGY Benchmark", True),  # identical text
    ("4.3", "4.3", True),  # identical numeric strings
):
    fc.test_eq(score_answer_value(first, second), outcome)
# Numeric strings that differ ("4.3" vs "4.13") are expected to score False.
# This assertion was fused onto the start of the fixture below, which made the
# line a syntax error; the two statements are now separated.
fc.test_eq(score_answer_value("4.3", "4.13"), False)

# Fixture pair for the score_is_na check that follows: the predicted and
# expected records are identical, with the unit, value, reference and
# supporting-material fields all set to the "is_blank" sentinel.
predicted_answer = {
    "answer": "False",
    "answer_unit": "is_blank",
    "answer_value": "is_blank",
    "explanation": "Quote",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "is_blank",
}
expected_answer = {
    "answer": "False",
    "answer_unit": "is_blank",
    "answer_value": "is_blank",
    "explanation": "Quote",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "is_blank",
}
# With the predicted/expected fixtures defined just above, score_is_na is
# expected to return True. This assertion was fused onto the start of the
# reassignment below (a syntax error); the statements are now separated.
fc.test_eq(score_is_na(predicted_answer, expected_answer), True)

# Redefine only the prediction: it now carries a concrete answer_value ("0")
# and non-blank supporting materials. expected_answer is left as it was.
predicted_answer = {
    "answer": "False",
    "answer_unit": "is_blank",
    "answer_value": "0",
    "explanation": "Quote",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "The limited availability of this data significantly reduces transparency and accountability, thereby weakening the potential for public oversight and market responses.",
}
# score_is_na is expected to return False for the predicted/expected pair
# currently in scope. This assertion was fused onto the start of the fixture
# below (a syntax error); the statements are now separated.
fc.test_eq(score_is_na(predicted_answer, expected_answer), False)

# Fixture pair for calculate_wattbot_score: both records have a blank
# answer_value and blank references but a concrete answer_unit ("MWh"); the
# free-text "answer" strings differ between the two.
predicted_answer = {
    "answer": "Unanswerable from the context.",
    "answer_unit": "MWh",
    "answer_value": "is_blank",
    "explanation": "is_blank",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "is_blank",
}
expected_answer = {
    "answer": "Unable to answer with confidence based on the ...",
    "answer_unit": "MWh",
    "answer_value": "is_blank",
    "explanation": "is_blank",
    "ref_id": "is_blank",
    "ref_url": "is_blank",
    "supporting_materials": "is_blank",
}

# Component scores expected for this pair: NA component 0.0, answer and
# reference components 1.0. `ws` stays bound for the checks that follow.
ws = calculate_wattbot_score(predicted_answer, expected_answer)
fc.test_eq(ws.na_score, 0.0)
fc.test_eq(ws.answer_score, 1.0)
fc.test_eq(ws.ref_score, 1.0)
# Overall score expected for the `ws` result computed just above. This
# assertion was fused onto the start of the fixture below (a syntax error);
# the statements are now separated.
fc.test_eq(ws.score, 0.9)

# Fixture pair for calculate_wattbot_score where prediction and expectation
# are identical, including a concrete answer_value and list-valued references.
predicted_answer = {
    "answer": "Local inference was emphasized as a sustainability measure because it reduces both network overhead and carbon footprint when deploying large language models.",
    "answer_unit": "is_blank",
    "answer_value": "1",
    "explanation": "is_blank",
    "ref_id": ["khan2025"],
    "ref_url": ["https://arxiv.org/pdf/2504.06307"],
    "supporting_materials": "The proposed framework tackles energy efficiency in LLM deployment through three interconnected components: local inference optimization, the selection of energy-efficient LLMs, and a comprehensive evaluation methodology.",
}
expected_answer = {
    "answer": "Local inference was emphasized as a sustainability measure because it reduces both network overhead and carbon footprint when deploying large language models.",
    "answer_unit": "is_blank",
    "answer_value": "1",
    "explanation": "is_blank",
    "ref_id": ["khan2025"],
    "ref_url": ["https://arxiv.org/pdf/2504.06307"],
    "supporting_materials": "The proposed framework tackles energy efficiency in LLM deployment through three interconnected components: local inference optimization, the selection of energy-efficient LLMs, and a comprehensive evaluation methodology.",
}

# Every component is expected to be 1.0 for an identical pair. `ws` stays
# bound for the check that follows.
ws = calculate_wattbot_score(predicted_answer, expected_answer)
fc.test_eq(ws.na_score, 1.0)
fc.test_eq(ws.answer_score, 1.0)
fc.test_eq(ws.ref_score, 1.0)
# Overall score expected for the `ws` result computed just above.
fc.test_eq(ws.score, 1.0)
# The text below was fused onto the assertion line, making it a syntax error.
# It looks like captured progress-bar output from a later cell whose code is
# not visible here, so it is preserved as a comment rather than executed.
# Out: Processing Rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [01:35<00:00, 2.33s/it]
32.7
Answering question: 0%| | 0/282 [00:00<?, ?it/s]/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'The total energy consumption of U.S. data centers increased by about 4% from 2010-2014' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df.loc[i, 'answer'] = str(answer['answer'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:10: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '4' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df.loc[i, 'answer_value'] = str(answer['answer_value'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:12: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['wu2021b']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df.loc[i, 'ref_id'] = str(answer['ref_id'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:13: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['https://arxiv.org/pdf/2108.06738']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df.loc[i, 'ref_url'] = str(answer['ref_url'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:14: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'The total energy consumption of the US data centers increased by about 4% from 2010-2014' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df.loc[i, 'supporting_materials'] = str(answer['supporting_materials'])
/var/folders/wh/7kgzwj895cb622q6m52_0v3r0000gn/T/ipykernel_4022/2808968084.py:15: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Chunk 9 from wu2021b explicitly states that U.S. data center energy consumption increased by about 4% between 2010-2014, which directly answers the question about the average increase in electricity consumption during this period.' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df.loc[i, 'explanation'] = str(answer['explanation'])
Answering question: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [10:45<00:00, 2.29s/it]