Initial Development with Solveit

Defining imports

import os
import openai
import pandas as pd
import fastcore.all as fc
import contextkit.read as rd
import fireworks as fw
import braintrust
import time

import ast
from types import SimpleNamespace
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams, PointStruct
from fastembed import SparseTextEmbedding
/usr/local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
em = "nomic-ai/nomic-embed-text-v1.5" # embedding model
# em = "accounts/fireworks/models/qwen3-embedding-8b"

Document Processor

class Chunk(fc.NS):
    """A document chunk: an id within the doc, the parent doc id, the text, plus arbitrary metadata."""
    def __init__(self, chunk_id, doc_id, text, **metadata):
        super().__init__(chunk_id=chunk_id, doc_id=doc_id, text=text, **metadata)

    def __repr__(self):
        # One line per attribute: bold-magenta key (ANSI 1;35), value truncated to 100 chars.
        lines = (f"\033[1;35m{key}\033[0m: {str(val)[:100]}" for key, val in self.__dict__.items())
        return "\n".join(lines)
Chunk('WattBot', 0, 0, **{'title': 'RAG'})
chunk_id: WattBot
doc_id: 0
text: 0
title: RAG

How to make the keys bold?

🤖Reply🤖

To make the keys bold in the __repr__ output, you can wrap them in ANSI escape codes or markdown-style formatting. Here’s a simple approach using ANSI codes for terminal output:

def __repr__(self):
    meta = "\n".join([f"\033[1m{k}\033[0m: {v}" for k, v in self.__dict__.items() if k != '__stored_args__'])
    return f"{meta}"

The \033[1m starts bold text and \033[0m resets it.

Would you like to try this, or would you prefer a different formatting approach?

Can it be colorized?

🤖Reply🤖

Yes! You can use different ANSI color codes. For example:

f"\033[1;34m{k}\033[0m: {v}"  # Bold blue
f"\033[1;32m{k}\033[0m: {v}"  # Bold green
f"\033[1;35m{k}\033[0m: {v}"  # Bold magenta

The format is \033[1;COLORm where common color codes are: - 31: red - 32: green - 33: yellow - 34: blue - 35: magenta - 36: cyan

Which color would you like to try?

import fastcore.all as fc
import contextkit.read as rd

class DocumentProcessor:
    """Reads a PDF from `data_path` and splits it into overlapping fixed-size text chunks."""
    def __init__(self, data_path='wattbot_data', chunk_size=1500, step=1400):
        fc.store_attr()

    def stream(self, doc_id, metadata=None):
        """Lazily yield `Chunk`s of `chunk_size` chars, advancing by `step` (100-char overlap by default)."""
        content = rd.read_pdf(fc.Path(self.data_path)/f'{doc_id}.pdf')
        extras = metadata or {}
        start = 0
        while start < len(content):
            yield Chunk(text=content[start:start+self.chunk_size], chunk_id=start, doc_id=doc_id, **extras)
            start += self.step

    def process(self, doc_id, metadata=None):
        """Materialize the chunk stream into a fastcore `L` list."""
        return fc.L(self.stream(doc_id, metadata=metadata))
doc_id, doc_type, doc_url, doc_title, doc_cite, doc_year = fc.getattrs(metadata.iloc[0], 'id', 'type', 'url', 'title', 'citation', 'year')
doc_proc = DocumentProcessor()
extras = {'type': doc_type, 'url': doc_url, 'title': doc_title, 'citation': doc_cite, 'year': doc_year}
chunks = fc.L(doc_proc.process(doc_id, metadata=extras))
chunks[0]
chunk_id: 0
doc_id: amazon2023
text: Amazon 
Sustainability 
Report
2023 Contents
Overview
3 Introduction
4 A Letter from Our Chief 
Sust
type: report
url: https://sustainability.aboutamazon.com/2023-amazon-sustainability-report.pdf
title: 2023 Amazon Sustainability Report
citation: Amazon Staff. (2023). Amazon Sustainability Report. https://sustainability.aboutamazon.com/2023-amaz
year: 2023
def process_docs(data_path='wattbot_data'):
    """Chunk every document listed in metadata.csv, carrying its metadata columns onto each chunk.

    Returns a single flat `fc.L` of all chunks across all documents.
    """
    metadata = pd.read_csv(fc.Path(data_path)/'metadata.csv', encoding='latin-1')
    doc_proc = DocumentProcessor()
    per_doc = metadata.apply(
        lambda row: doc_proc.process(row['id'],
                                     metadata=fc.attrdict(row, 'type', 'url', 'title', 'citation', 'year')),
        axis=1)
    # Each row produced an `L` of chunks; flatten the list-of-lists.
    return fc.L(per_doc.tolist()).concat()
all_chunks = process_docs()
all_chunks[0]
chunk_id: 0
doc_id: amazon2023
text: Amazon 
Sustainability 
Report
2023 Contents
Overview
3 Introduction
4 A Letter from Our Chief 
Sust
type: report
url: https://sustainability.aboutamazon.com/2023-amazon-sustainability-report.pdf
title: 2023 Amazon Sustainability Report
citation: Amazon Staff. (2023). Amazon Sustainability Report. https://sustainability.aboutamazon.com/2023-amaz
year: 2023

How to save all the chunks so that I don't have to generate them again?

🤖Reply🤖

You have several options for saving the chunks:

  1. Pickle - Fast and preserves Python objects:
fc.save_pickle('all_chunks.pkl', all_chunks)
  1. JSON - More portable and readable:
import json
with open('all_chunks.json', 'w') as f:
    json.dump([chunk.__dict__ for chunk in all_chunks], f)
  1. Parquet - Efficient for large datasets:
pd.DataFrame([chunk.__dict__ for chunk in all_chunks]).to_parquet('all_chunks.parquet')

Which format would you prefer, or would you like to know more about the tradeoffs?

How to load the chunks?

🤖Reply🤖

For each saving method, here’s how to load:

  1. Pickle:
all_chunks = fc.load_pickle('all_chunks.pkl')
  1. JSON:
with open('all_chunks.json', 'r') as f:
    all_chunks = fc.L([Chunk(**d) for d in json.load(f)])
  1. Parquet:
all_chunks = fc.L([Chunk(**row) for row in pd.read_parquet('all_chunks.parquet').to_dict('records')])

Which format did you use to save them?

Embedding

sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")
Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]Fetching 18 files:   6%|▌         | 1/18 [00:00<00:07,  2.34it/s]Fetching 18 files:  11%|█         | 2/18 [00:00<00:03,  4.22it/s]Fetching 18 files: 100%|██████████| 18/18 [00:00<00:00, 39.63it/s]Fetching 18 files: 100%|██████████| 18/18 [00:00<00:00, 26.24it/s]
class Embedder:
    """Creates dense embeddings via an OpenAI-compatible client and sparse embeddings via a local model."""
    def __init__(self, dclient, dmodel, smodel=None, bs=50):
        fc.store_attr()

    def create_dembs(self, texts):
        """Embed `texts` through the dense endpoint in batches of `bs`; returns the flat embedding list."""
        batches = fc.chunked(texts, chunk_sz=self.bs)
        responses = fc.mapped(lambda batch: self.dclient.embeddings.create(model=self.dmodel, input=batch), batches)
        return responses.attrgot('data').concat().attrgot('embedding')

    def create_sembs(self, texts):
        """Embed `texts` with the local sparse model; raises ValueError if none was configured."""
        if not self.smodel: raise ValueError("Sparse model not initialized")
        return list(self.smodel.embed(texts))
openai_cli = openai.OpenAI(base_url = "https://api.fireworks.ai/inference/v1", api_key=os.environ['FIREWORKS_API_KEY'])
embedder = Embedder(openai_cli, em, smodel=sparse_model)
embds = embedder.create_dembs(chunks.attrgot('text'))
len(embds), len(embds[0])
(322, 768)

I have an api key for gemini and want to use for embedding model with the openai spec https://ai.google.dev/gemini-api/docs/openai

🤖Reply🤖

To use Gemini’s embedding model with the OpenAI spec, you’ll need to:

  1. Change the base URL to Gemini’s endpoint
  2. Use your Gemini API key
  3. Specify a Gemini embedding model name

Here’s what to modify:

openai_cli = openai.OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=os.environ['GEMINI_API_KEY']
)

For the embedding model, Gemini supports models like text-embedding-004. You’d set:

em = "text-embedding-004"

Have you already set your GEMINI_API_KEY environment variable, or would you like help with that?

I am getting info logs when calling to gemini 2025-11-29 15:22:21,210 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/openai/embeddings “HTTP/1.1 200 OK”

🤖Reply🤖

You can suppress those INFO logs by adjusting the logging level for the HTTP client. Add this at the top of your notebook:

import logging
logging.getLogger("httpx").setLevel(logging.WARNING)

This will only show warnings and errors, not info messages. Would you like to try this?

all_dense_embds = embedder.create_dembs(all_chunks.attrgot('text'))
len(all_dense_embds), len(all_dense_embds[0])
(1899, 768)
all_sparse_embds = embedder.create_sembs(all_chunks.attrgot('text'))
len(all_sparse_embds), len(all_sparse_embds[0].values)
(1899, 137)

Qdrant

QHOST = "https://76eb75ff-5ca3-4a68-93d4-ae19a9c169a4.us-west-2-0.aws.cloud.qdrant.io:6333"
api_key = os.getenv("QDRANT_API_KEY")
class QdrantStore:
    """Wrapper around a Qdrant collection supporting dense-only or hybrid (dense + sparse) search.

    The collection always stores the dense vector under the name "dense" (and,
    when `use_hybrid`, the sparse vector under "sparse"), so every query must
    address the named vector explicitly.
    """
    def __init__(self, url, key, collection_name, use_hybrid=False):
        fc.store_attr()
        self.cli = QdrantClient(url=url, api_key=key)
        self.points = []  # points staged by create_points, consumed by upsert

    def create_or_replace_collection(self, vs, replace=False, distance=Distance.COSINE):
        """Create the collection with dense vector size `vs` (plus a sparse config if hybrid). Returns self."""
        if replace: self.cli.delete_collection(collection_name=self.collection_name)
        sparse_vectors_config = None
        if not self.cli.collection_exists(self.collection_name):
            if self.use_hybrid: sparse_vectors_config={"sparse": models.SparseVectorParams()}
            self.cli.create_collection(collection_name=self.collection_name,
                vectors_config={"dense": VectorParams(size=vs, distance=distance)},
                sparse_vectors_config=sparse_vectors_config)
        return self

    def _sv(self, semb):
        # Convert a fastembed sparse embedding (.values / .indices numpy arrays) to Qdrant's SparseVector.
        return models.SparseVector(values=semb.values.tolist(), indices=semb.indices.tolist())

    def create_hybrid_points(self, chunks, dembs, sembs, start_id=0):
        """Build PointStructs carrying both the named dense and sparse vectors."""
        return fc.L(chunks, dembs, sembs).zip().enumerate().map(
            lambda ele: PointStruct(id=start_id+ele[0],
                vector={"dense": ele[1][1], "sparse": self._sv(ele[1][2])},
                payload=ele[1][0].__dict__))

    def create_points(self, chunks, dembs, sembs=None, start_id=0):
        """Stage points for upsert; hybrid points when enabled and sparse embeddings were given. Returns self."""
        if self.use_hybrid and sembs:
            self.points = self.create_hybrid_points(chunks, dembs, sembs, start_id)
        else:
            self.points = fc.L(chunks, dembs).zip().enumerate().map(
                lambda ele: PointStruct(id=start_id+ele[0],
                    vector={"dense": ele[1][1]},
                    payload=ele[1][0].__dict__))
        return self

    def upsert(self, bs=100):
        """Upsert the staged points in batches of `bs`. Returns self."""
        fc.mapped(lambda o: self.cli.upsert(collection_name=self.collection_name, points=list(o)),
                  fc.chunked(self.points, chunk_sz=bs))
        return self

    def hybrid_search(self, query_demb, query_semb, limit=5):
        """RRF-fuse dense and sparse prefetch results (each branch over-fetches 2x `limit`)."""
        return self.cli.query_points(collection_name=self.collection_name,
            prefetch=[
                models.Prefetch(query=query_demb, using="dense", limit=limit*2),
                models.Prefetch(query=self._sv(query_semb), using="sparse", limit=limit*2)
            ],
            query=models.FusionQuery(fusion=models.Fusion.RRF),
            limit=limit
        )

    def search(self, query_demb, query_semb=None, limit=5):
        """Search the collection; hybrid when enabled and a sparse query was supplied, else dense-only."""
        if self.use_hybrid and query_semb is not None:
            results = self.hybrid_search(query_demb, query_semb, limit)
        else:
            # BUG FIX: the original referenced the undefined name `query_vector` here
            # (NameError on the dense-only path). Also pass `using="dense"` because
            # the collection was created with a named dense vector.
            results = self.cli.query_points(collection_name=self.collection_name,
                query=query_demb, using="dense", limit=limit)
        return results.points

Do we really require qdrant datastore for storing 1899 embeddings and searching? Will a brute force search work which does search across all the embedding

🤖Reply🤖

Great question! For 1,899 embeddings, brute force search would absolutely work and might even be simpler and faster than using Qdrant.

With modern hardware, computing cosine similarity across ~2,000 vectors is very quick (milliseconds). Vector databases like Qdrant are mainly beneficial when you have: - Hundreds of thousands or millions of vectors - Need for persistence across sessions - Advanced filtering on metadata - Multiple concurrent users

For your use case, you could use simple numpy/scipy operations:

from sklearn.metrics.pairwise import cosine_similarity

Would you like to try implementing a simple brute force search to compare? Or are you planning to scale up significantly and want to keep Qdrant for future growth?

And what about the keyword search?

🤖Reply🤖

Good point! For keyword search (like BM25), you have a few options without Qdrant:

  1. Simple approach: Use libraries like rank-bm25 which does in-memory BM25 scoring
  2. Hybrid with numpy: Store sparse embeddings as scipy sparse matrices and compute similarities
  3. Full-text search: Use SQLite’s FTS5 for keyword search

For 1,899 documents, all of these would be fast enough. The sparse embeddings you’re generating with Qdrant/bm25 model could be stored as scipy sparse matrices and searched with simple dot products.

Are you interested in implementing a lightweight alternative, or do you prefer keeping Qdrant for the convenience of having both dense and sparse search in one place?

cname = 'wattbot_docs_nomic_768_sparse'
# cname = 'wattbot_docs_qwen_8b'
qs = QdrantStore(QHOST, api_key, cname, use_hybrid=True).create_or_replace_collection(768, replace=True)
qs.create_points(all_chunks, all_dense_embds, all_sparse_embds).upsert()
<__main__.QdrantStore>
qs.cli.get_collections()
CollectionsResponse(collections=[CollectionDescription(name='wattbot_docs_nomic_768'), CollectionDescription(name='wattbot_documents_768'), CollectionDescription(name='wattbot_docs_qwen_8b'), CollectionDescription(name='colbert_chunks'), CollectionDescription(name='wattbot_documents'), CollectionDescription(name='wattbot_docs_nomic_768_sparse')])

Instead of the qdrant database I save the dense and sparse embeddings locally and do similarity search then combine the results

🤖Reply🤖

That’s a solid approach for this dataset size! You’d essentially be building a lightweight hybrid search system.

The basic steps would be: 1. Save dense embeddings (numpy array) 2. Save sparse embeddings (scipy sparse matrix or list) 3. For search: compute similarities separately for dense and sparse 4. Combine results using a fusion method (like RRF - Reciprocal Rank Fusion)

Would you like me to help you implement this step by step, or would you prefer to try building it yourself with guidance on specific parts as needed?

Yes I would need to help with saving the sparse embedding and compute similarity for them

🤖Reply🤖

Great! Let’s start with saving the sparse embeddings.

Since your sparse embeddings have .values and .indices attributes (from the fastembed model), you have a couple of options:

  1. Scipy sparse matrix - efficient for similarity computation
  2. List of dicts - simpler, keeps the structure as-is

For scipy, you’d convert to CSR format which is efficient for similarity searches. For the list approach, you’d just pickle the embeddings directly.

Which approach sounds better to you, or would you like to try the scipy sparse matrix route since it’s more efficient for similarity computation?

What is CSR format

🤖Reply🤖

CSR stands for Compressed Sparse Row format. It’s a way to efficiently store sparse matrices (matrices with mostly zeros) by only storing the non-zero values.

Instead of storing a full matrix like:

[0, 0, 3, 0, 5]
[2, 0, 0, 0, 0]
[0, 4, 0, 1, 0]

CSR stores three arrays: - values: the non-zero values [3, 5, 2, 4, 1] - indices: which column each value is in - indptr: where each row starts

This saves memory and makes operations like dot products much faster when you have lots of zeros (which is typical for sparse embeddings like BM25).

Does this make sense? Would you like to proceed with using scipy’s CSR format for your sparse embeddings?

Okay lets do that

🤖Reply🤖

Perfect! Let’s start with converting your sparse embeddings to CSR format.

First, you’ll need scipy’s sparse module:

from scipy.sparse import csr_matrix

To convert your all_sparse_embds (which are fastembed sparse vectors) to a CSR matrix, you’ll need to extract the indices and values from each embedding and stack them into a matrix.

Try creating a function that takes one sparse embedding and returns the indices and values. What do you get when you look at all_sparse_embds[0].indices and all_sparse_embds[0].values?

I created the function def get_val_idx(s): return s.indices, s.values

🤖Reply🤖

Good start! Now you have a way to extract the indices and values from each sparse embedding.

The next step is to build the CSR matrix from all your embeddings. You’ll need to: 1. Determine the maximum index across all embeddings (this will be the number of columns) 2. Stack all the indices and values together 3. Create the CSR matrix

Try finding the maximum index first. What do you get when you check the max index across all your sparse embeddings?

The highest value of the indices is 30151

🤖Reply🤖

Great! So your CSR matrix will have 30,152 columns (since indices are 0-based).

Now you need to build the CSR matrix. Scipy’s csr_matrix can be constructed using:

csr_matrix((data, indices, indptr), shape=(n_rows, n_cols))

Where: - data = all values concatenated - indices = all column indices concatenated
- indptr = array marking where each row starts in the data array

Try creating the data and indices arrays first by concatenating all the values and indices from your sparse embeddings. What approach will you use to concatenate them?

Yes

🤖Reply🤖

Great! Now, back to building your CSR matrix - do you want to continue with concatenating the data and indices arrays, or would you like to explore something else about your RAG system first?

lets do a simple lexical search instead of the model

🤖Reply🤖

Good idea! For simple lexical search, you have a few options:

  1. BM25 library (rank-bm25) - classic keyword scoring
  2. TF-IDF with sklearn - term frequency approach
  3. SQLite FTS5 - built-in full-text search

BM25 is generally considered best for search/retrieval tasks. Would you like to try the rank-bm25 library, or explore one of the other options?

Yes

🤖Reply🤖

Perfect! First, you’ll need to install the library:

!pip install rank-bm25

Then import it:

from rank_bm25 import BM25Okapi

BM25 needs tokenized documents to build its index. You’ll need to split your chunk texts into tokens (words). How would you like to tokenize - simple split() or something more sophisticated?

When adding the package rank-bm25 I am getting the following error

No solution found when resolving dependencies for split (python_full_version == ’3.9.*’): ╰─▶ Because the requested Python version (>=3.9) does not satisfy Python>=3.10 and contextkit==0.0.10 depends on Python>=3.10, we can conclude that contextkit==0.0.10 cannot be used. And because only contextkit<=0.0.10 is available and your project depends on contextkit>=0.0.10, we can conclude that your project’s requirements are unsatisfiable.

  hint: The `requires-python` value (>=3.9) includes Python versions that are not supported by your dependencies (e.g., contextkit==0.0.10 only supports >=3.10). Consider using a more restrictive
  `requires-python` value (like >=3.10).

help: If you want to add the package regardless of the failed resolution, provide the --frozen flag to skip locking and syncing. (wattbot) (base) MacBook-Pro:wattbot anubhavmaity$

🤖Reply🤖

This error is about your Python version and project dependencies, not specifically about rank-bm25.

Your project requires contextkit>=0.0.10 which needs Python 3.10+, but your project’s requires-python is set to >=3.9.

You have two options:

  1. Update your project’s Python requirement to >=3.10 in your pyproject.toml or similar config file
  2. Use the --frozen flag as suggested: uv pip install rank-bm25 --frozen (though this skips dependency resolution)

Which Python version are you actually running? You can check with python --version.

okay I was able to install rank-bm25, now what is next

🤖Reply🤖

Great! Now you need to tokenize your chunk texts to build the BM25 index.

Try creating a list of tokenized documents from your chunks. For each chunk’s text, split it into words. What method will you use - simple split() or something else?

simple split will do

🤖Reply🤖

Perfect! Try creating the tokenized corpus:

tokenized_corpus = [chunk.text.lower().split() for chunk in all_chunks]

The .lower() helps with case-insensitive matching. Once you have this, you can create the BM25 index. Give it a try and let me know what you get!

got the tokenized corpus what is next?

🤖Reply🤖

Excellent! Now create the BM25 index:

bm25 = BM25Okapi(tokenized_corpus)

This builds the index from your tokenized documents. Once you have this, you’ll be able to search by tokenizing a query and using bm25.get_scores() or bm25.get_top_n().

Have you created the index?

What should I pass to get_top_n

🤖Reply🤖

get_top_n() takes three main arguments:

  1. tokenized_query - your search query split into tokens (like you did for the corpus)
  2. documents - the original documents/chunks you want returned
  3. n - how many top results you want

So it would look something like:

bm25.get_top_n(tokenized_query, all_chunks, n=5)

What query would you like to test with?

How should I create a new class to have bm25 and do the cosine similarity across all vectors?

🤖Reply🤖

Good thinking to organize this into a class! Here’s a structure to consider:

class HybridSearch:
    def __init__(self, chunks, dense_embds, tokenized_corpus):
        # Store your data
        # Initialize BM25
        # Store dense embeddings as numpy array

You’ll need methods like: - lexical_search(query, top_k) - for BM25 - dense_search(query_emb, top_k) - for cosine similarity - hybrid_search(query, query_emb, top_k) - combines both

For cosine similarity, you can use sklearn.metrics.pairwise.cosine_similarity or just normalize vectors and use dot product.

Would you like to start building this class yourself, or would you like me to write it out for you?

Can you please show me how to do the cosine similarity?

🤖Reply🤖

Sure! Here are two approaches for cosine similarity:

Option 1 - Using sklearn:

from sklearn.metrics.pairwise import cosine_similarity
scores = cosine_similarity([query_emb], dense_embds)[0]

Option 2 - Using numpy (faster for normalized vectors):

import numpy as np
# Normalize vectors first
dense_embds_norm = dense_embds / np.linalg.norm(dense_embds, axis=1, keepdims=True)
query_norm = query_emb / np.linalg.norm(query_emb)
scores = dense_embds_norm @ query_norm

Both give you similarity scores for each document. To get top-k results, you’d use np.argsort(scores)[-k:][::-1] to get the indices.

Which approach would you like to use in your class?

Retriever

class Retriever:
    """Embeds a query with the configured embedder and delegates to the vector store's search."""
    def __init__(self, embedder, vector_store):
        fc.store_attr()

    def retrieve(self, query, limit=5):
        """Return the top `limit` search results for `query`; adds a sparse query when a sparse model exists."""
        dense = self.embedder.create_dembs(texts=[query])[0]
        sparse = self.embedder.create_sembs(texts=[query])[0] if self.embedder.smodel else None
        return self.vector_store.search(dense, sparse, limit=limit)
query_text = "What are Amazon's carbon emissions goals?"
retriever = Retriever(embedder, qs)
retriever.retrieve(query_text, limit=1)[0].payload['text'][:200]
'to build Climate Pledge Arena, the first International \nLiving Future Institute zero-carbon certified sports and \nentertainment venue in the world.\nCarbon \nThe science is clear. Significant carbon emi'

To support creating the embeddings for both sparse and dense.

🤖Reply🤖

You need to update the Retriever class to handle both dense and sparse embeddings. The error shows it’s calling create_embeddings which doesn’t exist - your Embedder has create_dembs and create_sembs instead.

Here’s how you can modify the retrieve method to work with your hybrid setup:

def retrieve(self, query, limit=5):
    query_demb = self.embedder.create_dembs(texts=[query])[0]
    query_semb = self.embedder.create_sembs(texts=[query])[0] if self.embedder.smodel else None
    # Now you need to decide how to combine them for search

However, I notice your QdrantStore.search method currently only takes a single query_vector. For hybrid search with both dense and sparse vectors, you’ll need to update that method too.

Would you like help modifying the search method to support hybrid queries, or do you want to work on that yourself?

Yes

🤖Reply🤖

Great! For hybrid search in Qdrant, you need to use the query_points method with a prefetch parameter to combine dense and sparse searches.

Here’s the key structure:

def search(self, query_demb, query_semb=None, limit=5):
    if self.use_hybrid and query_semb:
        # Use prefetch for hybrid search
        results = self.cli.query_points(
            collection_name=self.collection_name,
            prefetch=[
                models.Prefetch(query=query_demb, using="dense", limit=limit*2),
                models.Prefetch(query=self._sv(query_semb), using="sparse", limit=limit*2)
            ],
            query=models.FusionQuery(fusion=models.Fusion.RRF),
            limit=limit
        )

The prefetch retrieves candidates from both dense and sparse vectors, then FusionQuery with RRF (Reciprocal Rank Fusion) combines the results.

Would you like to complete the rest of the method to handle the non-hybrid case?

query_text = "What dataset of 5,842 labeled entries was used to test energy-efficient large language models in the financial domain?"
retriever.retrieve(query_text, limit=3)[2]
ScoredPoint(id=1864, version=18, score=0.33333334, payload={'chunk_id': 8400, 'doc_id': 'zschache2025', 'text': ' on a single\nbenchmark, we extend these efforts to a greater number of partially very large models\ndeployed to a practical inference scenario.\nThis makes our study very similar to the one by Alizadeh et al. (2025), who inves-\ntigated the trade-offs between accuracy and energy consumption when deploying large\nlanguage models (LLMs) for software development tasks. Besides the finding that\nlarger LLMs with higher energy consumption do not always yield significantly bet-\nter accuracy, the authors demonstrated that architectural factors, such as feedforward\nlayer size and transformer block count, directly correlate with energy usage.\n3 Finally, Luccioni et al. (2024) provide one of the most comprehensive analyses\nof energy consumption during ML model inference. Their study systematically com-\npared the energy costs of 88 models across 10 tasks and 30 datasets, including both\nsmaller task-specific and larger multi-purpose models. They found that the larger\nmodels are orders of magnitude more energy-intensive than smaller task-specific ones,\nespecially for tasks involving text and image generation. Furthermore, their research\nunderscores the variability in energy consumption across tasks and model architec-\ntures. The authors advocate for increased transparency and sustainable deployment\npractices, emphasizing that the environmental costs of deploying large, multi-purpose\nAI systems must be carefully weighed against their utility.\n3 Data and methods\nOur experiments are inspired by a', 'type': 'paper', 'url': 'https://arxiv.org/pdf/2508.14170 ', 'title': 'Comparing energy consumption and accuracy in text classification inference', 'citation': 'Johannes Zschache, & Tilman Hartwig (2025). Comparing energy consumption and accuracy in text classification inference arXiv. 
https://arxiv.org/pdf/2508.14170', 'year': 2025}, vector=None, shard_key=None, order_value=None)

How to remove the stored_args from the ScoredPoint above?

🤖Reply🤖

The __stored_args__ is being added to the payload because it’s part of the Chunk object’s __dict__. You can filter it out when creating the points.

In your QdrantStore.create_points method, modify the payload creation to exclude it:

payload={k: v for k, v in ele[1][0].__dict__.items() if k != '__stored_args__'}

This filters out __stored_args__ before storing the payload. Would you like to update both the create_points and create_hybrid_points methods with this change?

Cant we modify the Chunk class to not store the stored_args?

🤖Reply🤖

Yes, absolutely! That’s a cleaner solution. The __stored_args__ comes from fastcore’s store_attr() function. You can either:

  1. Override __dict__ to exclude it
  2. Delete it after calling store_attr()
  3. Use a property/method to return a clean dict

The simplest approach would be to add this line in your Chunk.__init__:

del self.__stored_args__

Would you like to try that, or would you prefer a different approach?

I want to search for the text “5,842 labeled entries” in all_chunks

🤖Reply🤖

You can search for that text using a list comprehension or the filter method. Here’s a simple approach:

matching_chunks = [chunk for chunk in all_chunks if '5,842 labeled entries' in chunk.text]

This will give you all chunks that contain that exact phrase. Would you like to try this and see what you find?

khan2025_chunks = DocumentProcessor().process('khan2025')
[chunk for chunk in khan2025_chunks if '5,842 entries' in chunk.text]
[chunk_id: 18200
 doc_id: khan2025
 text: sed language model
 tuned for general-purpose tasks.
 • Mistral-7B-Instruct-v0.3: An instruction-tuned]

It does not return the result with 5,842 labeled entries. How to make sure it returns that?

🤖Reply🤖

The issue is that you’re searching for the exact phrase ‘5,842 entries’ but the actual text might have different formatting - perhaps it says ‘5,842 labeled entries’ or has extra spaces or punctuation.

Try searching with just the number:

[chunk for chunk in khan2025_chunks if '5,842' in chunk.text]

This will find any chunk containing that number, regardless of the surrounding words. Does that return the result you’re looking for?

[chunk for chunk in khan2025_chunks if '5,842' in chunk.text]
[chunk_id: 18200
 doc_id: khan2025
 text: sed language model
 tuned for general-purpose tasks.
 • Mistral-7B-Instruct-v0.3: An instruction-tuned]
# [chunk for chunk in all_chunks if 'JetMoE-8B' in chunk.text]

Why the retrieve function is returning the result as 2nd index instead of first?

🤖Reply🤖

Good question! There are a few possible reasons why the most relevant chunk isn’t ranking first:

  1. Embedding similarity - The query embedding might be more similar to other chunks based on semantic meaning, not just keyword matching
  2. Hybrid search weighting - The RRF (Reciprocal Rank Fusion) is combining dense and sparse results, and the sparse (BM25) scoring might be favoring other chunks
  3. Context around the keywords - Other chunks might have more context that semantically matches your query

To debug this, you could: - Check the scores of the top 3 results to see how close they are - Try adjusting the fusion weights or prefetch limits in your hybrid search - Compare dense-only vs sparse-only vs hybrid results

Would you like to investigate any of these possibilities?

Generator

class Generator:
    """Generate answers from retrieved chunks via an LLM chat-completions client.

    `llm_client` is any OpenAI-spec client (e.g. `fw.LLM`); `prompt_template`
    is a Braintrust prompt whose `.build()` returns a dict with a 'messages' list.
    """
    def __init__(self, llm_client, prompt_template):
        fc.store_attr()
    
    def create_context(self, retrieved_chunks):
        "Format retrieved chunks into a single context string for the prompt."
        return "\n\n".join([f"Doc ID: {chunk.payload['doc_id']}\nURL: {chunk.payload['url']}\nTitle: {chunk.payload['title']}\nText: {chunk.payload['text']}" for chunk in retrieved_chunks])
    
    def format_prompt(self, query, context):
        "Build the prompt messages from the template, filling in context and question."
        return self.prompt_template.build(context=context, question=query)
    
    def generate(self, query, retrieved_chunks):
        "Answer `query` grounded in `retrieved_chunks`; returns the parsed JSON dict."
        context = self.create_context(retrieved_chunks)
        formatted_prompt = self.format_prompt(query, context)
        # Pass `messages` by keyword: OpenAI-spec `create` takes keyword-only
        # parameters, so a positional list is fragile across client versions.
        response = self.llm_client.chat.completions.create(
            messages=formatted_prompt['messages'],
            response_format={"type": "json_object"})
        time.sleep(1)  # crude rate limiting between successive calls
        return fc.loads(response.choices[0].message.content)
# llm = fw.LLM(model="accounts/fireworks/models/deepseek-v3p1",  deployment_type="auto")
# llm = fw.LLM(model="accounts/fireworks/models/deepseek-r1-0528",  deployment_type="auto")
# model = 'accounts/fireworks/models/kimi-k2-instruct-0905'
# model = 'accounts/fireworks/models/llama-v3p1-70b-instruct'
model = 'accounts/fireworks/models/deepseek-v3p1'

# model = 'accounts/fireworks/models/kimi-k2-thinking'

llm = fw.LLM(model=model,  deployment_type="auto")

I want to use a gemini model with openai spec

🤖Reply🤖

You can use Gemini models with the OpenAI spec by creating an OpenAI client with Gemini’s base URL, similar to what you did for embeddings earlier.

For the Generator class, you’d pass in an OpenAI client configured for Gemini:

gemini_client = openai.OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=os.environ['GEMINI_API_KEY']
)

Then for the model name, you’d use something like "gemini-1.5-pro" or "gemini-1.5-flash".

Have you already set up your Gemini API key, or would you like help with that part?

How to pass the model name?

🤖Reply🤖

When you create the chat completion in your Generator.generate method, you pass the model name to the create call:

response = self.llm_client.chat.completions.create(
    model="gemini-1.5-pro",  # or "gemini-1.5-flash"
    messages=formatted_prompt['messages'],
    response_format={"type": "json_object"}
)

Note: Check if Gemini supports the response_format parameter - not all models/providers support structured JSON output this way.

Would you like to test this with your Gemini client?

How to enable the thinking mode for the kimi-k2-thinking model? And is there a way to prefill the model's response with {?

🤖Reply🤖

For thinking models like kimi-k2-thinking, you typically need to:

  1. Enable thinking mode - Check if there’s a parameter like enable_thinking=True or similar in the Fireworks API
  2. Prefill with { - Yes, some models support prefilling the assistant’s response to force JSON output

For prefilling, you’d add an assistant message with just { to force JSON formatting:

messages = [
    {"role": "user", "content": "your prompt"},
    {"role": "assistant", "content": "{", "prefix": True}
]

The exact parameter name might vary. Have you checked the Fireworks documentation for kimi-k2-thinking specific parameters? Would you like me to help you look that up?

Yes please

🤖Reply🤖
{
  "id": "toolu_01QueTq9m7VMedbMSWThKkaY",
  "call": {
    "function": "read_url",
    "arguments": {
      "url": "https://docs.fireworks.ai/guides/querying-text-models"
    }
  },
  "result": "'Skip to main content\\n\\n[Fireworks AI Docs home page![light logo](https://mintcdn.com/fireworksai/sTHhFfY93wc80BaS/logo/logo.svg?fit=max&auto=format&n=sTHhFfY93wc80BaS&q=85&s=33df07dcf4adc8ba74a3d9c6e5153b3a)![dark logo](https://mintcdn.com/fireworksai/sTHhFfY93wc80BaS/logo/logo.svg?fit=max&auto=format&n=sTHhFfY93wc80BaS&q=85&s=33df07dcf4adc8ba74a3d9c6e5153b3a)](/)\\n\\n[Documentation](/getting-started/introduction)[API & SDK Reference](/api-reference/introduction)[CLI Reference](/tools-sdks/firectl/firectl)[Demos](https://demos.fireworks.ai)[Resources](/getting-started/concepts)\\n\\n  * [Community](https://discord.gg/fireworks-ai)\\n  * [Status](https://status.fireworks.ai/)\\n  * [Dashboard](https://app.fireworks.ai/login)\\n  * [Dashboard](https://app.fireworks.ai/login)\\n\\nSearch...\\n\\nNavigation\\n\\nModels & Inference\\n\\nText Models\\n\\nSearch...\\n\\n\u2318K\\n\\n##### Get Started\\n\\n  * [Overview](/getting-started/introduction)\\n  * [Serverless Quickstart](/getting-started/quickstart)\\n  * [Deployments Quickstart](/getting-started/ondemand-quickstart)\\n  * [Which model should I use?](/guides/recommended-models)\\n\\n##### Deployments\\n\\n  * [Deployments Overview](/guides/ondemand-deployments)\\n  * [Autoscaling](/deployments/autoscaling)\\n  * [Custom Models](/models/uploading-custom-models)\\n  * [Quantization](/models/quantization)\\n  * Advanced\\n\\n##### Models & Inference\\n\\n  * [Text Models](/guides/querying-text-models)\\n  * [Vision Models](/guides/querying-vision-language-models)\\n  * [Speech to Text Models](/guides/querying-asr-models)\\n  * [Embeddings & Reranking](/guides/querying-embeddings-models)\\n  * [Tool Calling](/guides/function-calling)\\n  * [Batch API](/guides/batch-inference)\\n  * Advanced\\n\\n##### Fine Tuning\\n\\n  * [Fine Tuning Overview](/fine-tuning/finetuning-intro)\\n  * Supervised Fine Tuning\\n\\n  * Reinforcement Fine Tuning\\n\\n  * [Deploying Fine Tuned 
Models](/fine-tuning/deploying-loras)\\n\\n##### Administration\\n\\n  * [Managing users](/accounts/users)\\n  * [Custom SSO](/accounts/sso)\\n  * [Service Accounts](/accounts/service-accounts)\\n  * [Rate Limits & Quotas](/guides/quotas_usage/rate-limits)\\n  * [Exporting Billing Metrics](/accounts/exporting-billing-metrics)\\n\\n##### Security & Compliance\\n\\n  * [Data Security & Privacy](/guides/security_compliance/data_security)\\n  * [Zero Data Retention](/guides/security_compliance/data_handling)\\n  * [Audit & Access Logs](/guides/security_compliance/audit_logs)\\n\\n##### Integrations\\n\\n  * [Cloud Integrations](/ecosystem/integrations)\\n  * [Agent Frameworks](/ecosystem/integrations/agent-frameworks)\\n  * [MLOps & Observability](/ecosystem/integrations/mlops-observability)\\n\\nOn this page\\n\\n  * Chat Completions API\\n  * Alternative query methods\\n  * Querying dedicated deployments\\n  * Common patterns\\n  * Multi-turn conversations\\n  * System prompts\\n  * Streaming responses\\n  * Async requests\\n  * Usage & performance tracking\\n  * Advanced capabilities\\n  * Configuration & debugging\\n  * Understanding tokens\\n  * OpenAI SDK Migration\\n  * Next steps\\n\\nModels & Inference\\n\\n# Text Models\\n\\nCopy page\\n\\nQuery, track and manage inference for text models\\n\\nCopy page\\n\\nNew to Fireworks? Start with the [Serverless Quickstart](/getting-started/quickstart) for a step-by-step guide to making your first API call.\\n\\nFireworks provides fast, cost-effective access to leading open-source text models through OpenAI-compatible APIs. Query models via serverless inference or dedicated deployments using the chat completions API (recommended), completions API, or responses API. 
[Browse 100+ available models \u2192](https://fireworks.ai/models)\\n\\n##\\n\\n\\u200b\\n\\nChat Completions API\\n\\n  * Python\\n\\n  * JavaScript\\n\\n  * curl\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nimport os\\nfrom openai import OpenAI\\n\\nclient = OpenAI(\\n    api_key=os.environ.get(\"FIREWORKS_API_KEY\"),\\n    base_url=\"https://api.fireworks.ai/inference/v1\"\\n)\\n\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\\n        \"role\": \"user\",\\n        \"content\": \"Explain quantum computing in simple terms\"\\n    }]\\n)\\n\\nprint(response.choices[0].message.content)\\n\\n```\\n\\nMost models automatically format your messages with the correct template. To verify the exact prompt used, enable the `echo` parameter.\\n\\n##\\n\\n\\u200b\\n\\nAlternative query methods\\n\\nFireworks also supports [Completions API](/guides/completions-api) and [Responses API](/guides/response-api).\\n\\n##\\n\\n\\u200b\\n\\nQuerying dedicated deployments\\n\\nFor consistent performance, guaranteed capacity, or higher throughput, you can query [on-demand deployments](/guides/ondemand-deployments) instead of serverless models. 
Deployments use the same APIs with a deployment-specific model identifier:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\n<MODEL_NAME>#<DEPLOYMENT_NAME>\\n\\n```\\n\\nFor example:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1#accounts/<ACCOUNT_ID>/deployments/<DEPLOYMENT_ID>\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}]\\n)\\n\\n```\\n\\n##\\n\\n\\u200b\\n\\nCommon patterns\\n\\n###\\n\\n\\u200b\\n\\nMulti-turn conversations\\n\\nMaintain conversation history by including all previous messages:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nmessages = [\\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\\n    {\"role\": \"user\", \"content\": \"What\\'s the capital of France?\"},\\n    {\"role\": \"assistant\", \"content\": \"The capital of France is Paris.\"},\\n    {\"role\": \"user\", \"content\": \"What\\'s its population?\"}\\n]\\n\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=messages\\n)\\n\\nprint(response.choices[0].message.content)\\n\\n```\\n\\nThe model uses the full conversation history to provide contextually relevant responses.\\n\\n###\\n\\n\\u200b\\n\\nSystem prompts\\n\\nOverride the default system prompt by setting the first message with `role: \"system\"`:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nmessages = [\\n    {\"role\": \"system\", \"content\": \"You are a helpful Python expert who provides concise code examples.\"},\\n    {\"role\": \"user\", \"content\": \"How do I read a CSV file?\"}\\n]\\n\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=messages\\n)\\n\\n```\\n\\nTo completely omit the system prompt, set the first message\u2019s `content` to an empty string.\\n\\n###\\n\\n\\u200b\\n\\nStreaming responses\\n\\nStream tokens as they\u2019re generated for real time, interactive UX. 
Covered in detail in the [Serverless Quickstart](/getting-started/quickstart#streaming-responses).\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nstream = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Tell me a story\"}],\\n    stream=True\\n)\\n\\nfor chunk in stream:\\n    if chunk.choices[0].delta.content:\\n        print(chunk.choices[0].delta.content, end=\"\")\\n\\n```\\n\\n**Aborting streams:** Close the connection to stop generation and avoid billing for ungenerated tokens:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nfor chunk in stream:\\n    print(chunk.choices[0].delta.content, end=\"\")\\n    if some_condition:\\n        stream.close()\\n        break\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nAsync requests\\n\\nUse async clients to make multiple concurrent requests for better throughput:\\n\\n  * Python\\n\\n  * JavaScript\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nimport asyncio\\nfrom openai import AsyncOpenAI\\n\\nclient = AsyncOpenAI(\\n    api_key=os.environ.get(\"FIREWORKS_API_KEY\"),\\n    base_url=\"https://api.fireworks.ai/inference/v1\"\\n)\\n\\nasync def main():\\n    response = await client.chat.completions.create(\\n        model=\"accounts/fireworks/models/deepseek-v3p1\",\\n        messages=[{\"role\": \"user\", \"content\": \"Hello\"}]\\n    )\\n    print(response.choices[0].message.content)\\n\\nasyncio.run(main())\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nUsage & performance tracking\\n\\nEvery response includes token usage information and performance metrics for debugging and observability. For aggregate metrics over time, see the [usage dashboard](https://app.fireworks.ai/account/usage). **Token usage** (prompt, completion, total tokens) is included in the response body for all requests. **Performance metrics** (latency, time-to-first-token, etc.) are included in response headers for non-streaming requests. 
For streaming requests, use the `perf_metrics_in_response` parameter to include all metrics in the response body.\\n\\n  * Non-streaming\\n\\n  * Streaming (usage only)\\n\\n  * Streaming (with performance metrics)\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}]\\n)\\n\\n# Token usage (always included)\\nprint(response.usage.prompt_tokens)      # Tokens in your prompt\\nprint(response.usage.completion_tokens)  # Tokens generated\\nprint(response.usage.total_tokens)       # Total tokens billed\\n\\n# Performance metrics are in response headers:\\n# fireworks-prompt-tokens, fireworks-server-time-to-first-token, etc.\\n\\n```\\n\\nUsage information is automatically included in the final chunk for streaming responses (the chunk with `finish_reason` set). This is a Fireworks extension - OpenAI SDK doesn\u2019t return usage for streaming by default.\\n\\nFor all available metrics and details, see the [API reference documentation](/api-reference/post-chatcompletions#response-perf_metrics).\\n\\nIf you encounter errors during inference, see [Inference Error Codes](/guides/inference-error-codes) for common issues and resolutions.\\n\\n##\\n\\n\\u200b\\n\\nAdvanced capabilities\\n\\nExtend text models with additional features for structured outputs, tool integration, and performance optimization:\\n\\n## [Tool callingConnect models to external tools and APIs with type-safe parameters](/guides/function-calling)## [Structured outputsEnforce JSON schemas for reliable data extraction](/structured-responses/structured-response-formatting)## [Responses APIMulti-step reasoning for complex problem-solving](/guides/response-api)## [Predicted outputsSpeed up edits by predicting unchanged sections](/guides/predicted-outputs)## [Prompt cachingCache common prompts to reduce latency and cost](/guides/prompt-caching)## [Batch inferenceProcess 
large volumes of requests asynchronously](/guides/batch-inference)\\n\\n##\\n\\n\\u200b\\n\\nConfiguration & debugging\\n\\nSampling parameters\\n\\nControl how the model generates text. Fireworks automatically uses recommended sampling parameters from each model\u2019s HuggingFace `generation_config.json` when you don\u2019t specify them explicitly, ensuring optimal performance out-of-the-box.We pull `temperature`, `top_k`, `top_p`, `min_p`, and `typical_p` from the model\u2019s configuration when not explicitly provided.\\n\\n###\\n\\n\\u200b\\n\\nTemperature\\n\\nAdjust randomness (0 = deterministic, higher = more creative):\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Write a poem\"}],\\n    temperature=0.7  # Override model default\\n)\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nMax tokens\\n\\nControl the maximum number of tokens in the generated completion:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nmax_tokens=100  # Generate at most 100 tokens\\n\\n```\\n\\n**Important notes:**\\n\\n  * Default value is 2048 tokens if not specified\\n  * Most models support up to their full context window (e.g., 128K for DeepSeek R1)\\n  * When the limit is reached, you\u2019ll see `\"finish_reason\": \"length\"` in the response\\n\\nSet `max_tokens` appropriately for your use case to avoid truncated responses. 
Check the model\u2019s context window in the [Model Library](https://fireworks.ai/models).\\n\\n###\\n\\n\\u200b\\n\\nTop-p (nucleus sampling)\\n\\nConsider only the most probable tokens summing to `top_p` probability mass:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\ntop_p=0.9  # Consider top 90% probability mass\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nTop-k\\n\\nConsider only the k most probable tokens:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\ntop_k=50  # Consider top 50 tokens\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nMin-p\\n\\nExclude tokens below a probability threshold:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nmin_p=0.05  # Exclude tokens with <5% probability\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nTypical-p\\n\\nUse typical sampling to select tokens with probability close to the entropy of the distribution:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\ntypical_p=0.95  # Consider tokens with typical probability\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nRepetition penalties\\n\\nReduce repetitive text with `frequency_penalty`, `presence_penalty`, or `repetition_penalty`:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nfrequency_penalty=0.5,   # Penalize frequent tokens (OpenAI compatible)\\npresence_penalty=0.5,    # Penalize any repeated token (OpenAI compatible)\\nrepetition_penalty=1.1   # Exponential penalty from prompt + output\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nSampling options header\\n\\nThe `fireworks-sampling-options` header contains the actual default sampling parameters used for the model, including values from the model\u2019s HuggingFace `generation_config.json`:\\n\\n  * Python\\n\\n  * JavaScript\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nimport os\\nfrom openai import OpenAI\\n\\nclient = OpenAI(\\n    api_key=os.environ.get(\"FIREWORKS_API_KEY\"),\\n    base_url=\"https://api.fireworks.ai/inference/v1\"\\n)\\n\\nresponse = client.chat.completions.with_raw_response.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}]\\n)\\n\\n# Access headers from the 
raw response\\nsampling_options = response.headers.get(\\'fireworks-sampling-options\\')\\nprint(sampling_options)  # e.g., \\'{\"temperature\": 0.7, \"top_p\": 0.9}\\'\\n\\ncompletion = response.parse()  # get the parsed response object\\nprint(completion.choices[0].message.content)\\n\\n```\\n\\nSee the [API reference](/api-reference/post-chatcompletions) for detailed parameter descriptions.\\n\\nMultiple generations\\n\\nGenerate multiple completions in one request:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}],\\n    n=3  # Generate 3 different jokes\\n)\\n\\nfor choice in response.choices:\\n    print(choice.message.content)\\n\\n```\\n\\nToken probabilities (logprobs)\\n\\nInspect token probabilities for debugging or analysis:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\\n    logprobs=True,\\n    top_logprobs=5  # Show top 5 alternatives per token\\n)\\n\\nfor content in response.choices[0].logprobs.content:\\n    print(f\"Token: {content.token}, Logprob: {content.logprob}\")\\n\\n```\\n\\nPrompt inspection (echo & raw_output)\\n\\nVerify how your prompt was formatted:**Echo:** Return the prompt along with the generation:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\\n    echo=True\\n)\\n\\n```\\n\\n**Raw output:** See raw token IDs and prompt fragments:\\n\\nExperimental API - may change without notice.\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\\n    
raw_output=True\\n)\\n\\nprint(response.raw_output.prompt_token_ids)  # Token IDs\\nprint(response.raw_output.completion)        # Raw completion\\n\\n```\\n\\nIgnore EOS token\\n\\nForce generation to continue past the end-of-sequence token (useful for benchmarking):\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\\n    ignore_eos=True,\\n    max_tokens=100  # Will always generate exactly 100 tokens\\n)\\n\\n```\\n\\nOutput quality may degrade when ignoring EOS. This API is experimental and should not be relied upon for production use cases.\\n\\nLogit bias\\n\\nModify token probabilities to encourage or discourage specific tokens:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\\n    logit_bias={\\n        123: 10.0,   # Strongly encourage token ID 123\\n        456: -50.0   # Strongly discourage token ID 456\\n    }\\n)\\n\\n```\\n\\nMirostat sampling\\n\\nControl perplexity dynamically using the [Mirostat algorithm](https://arxiv.org/abs/2007.14966):\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nresponse = client.chat.completions.create(\\n    model=\"accounts/fireworks/models/deepseek-v3p1\",\\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\\n    mirostat_target=5.0,  # Target perplexity\\n    mirostat_lr=0.1       # Learning rate for adjustments\\n)\\n\\n```\\n\\n##\\n\\n\\u200b\\n\\nUnderstanding tokens\\n\\nLanguage models process text in chunks called **tokens**. In English, a token can be as short as one character or as long as one word. Different model families use different **tokenizers** , so the same text may translate to different token counts depending on the model. 
**Why tokens matter:**\\n\\n  * Models have maximum context lengths measured in tokens\\n  * Pricing is based on token usage (prompt + completion)\\n  * Token count affects response time\\n\\nFor Llama models, use [this tokenizer tool](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/) to estimate token counts. Actual usage is returned in the `usage` field of every API response.\\n\\n##\\n\\n\\u200b\\n\\nOpenAI SDK Migration\\n\\nOpenAI SDK compatibility notes\\n\\nFireworks provides an OpenAI-compatible API, making migration straightforward. However, there are some minor differences to be aware of:\\n\\n###\\n\\n\\u200b\\n\\nBehavioral differences\\n\\n**`stop` parameter:**\\n\\n  * **Fireworks** : Returns text including the stop word\\n  * **OpenAI** : Omits the stop word\\n  * _You can easily truncate it client-side if needed_\\n\\n**`max_tokens` with context limits:**\\n\\n  * **Fireworks** : Automatically adjusts `max_tokens` lower if `prompt + max_tokens` exceeds the model\u2019s context window\\n  * **OpenAI** : Returns an invalid request error\\n  * _Control this behavior with the`context_length_exceeded_behavior` parameter_\\n\\n**Streaming usage stats:**\\n\\n  * **Fireworks** : Returns `usage` field in the final chunk (where `finish_reason` is set) for both streaming and non-streaming\\n  * **OpenAI** : Only returns usage for non-streaming responses\\n\\nExample accessing streaming usage:\\n\\nCopy\\n\\nAsk AI\\n\\n```\\nfor chunk in client.chat.completions.create(stream=True, ...):\\n    if chunk.usage:  # Available in final chunk\\n        print(f\"Tokens: {chunk.usage.total_tokens}\")\\n\\n```\\n\\n###\\n\\n\\u200b\\n\\nUnsupported parameters\\n\\nThe following OpenAI parameters are not yet supported:\\n\\n  * `presence_penalty`\\n  * `frequency_penalty`\\n  * `best_of` (use `n` instead)\\n  * `logit_bias`\\n  * `functions` (deprecated - use [Tool Calling](/guides/function-calling) with the `tools` parameter instead)\\n\\nHave a use 
case requiring one of these? [Join our Discord](https://discord.gg/fireworks-ai) to discuss.\\n\\n##\\n\\n\\u200b\\n\\nNext steps\\n\\n## [Vision modelsProcess images alongside text](/guides/querying-vision-language-models)## [Audio modelsTranscribe and translate audio](/guides/querying-asr-models)## [EmbeddingsGenerate vector representations for search](/guides/querying-embeddings-models)## [On-demand deploymentsDeploy models on dedicated GPUs](/guides/ondemand-deployments)## [Fine-tuningCustomize models for your use case](/fine-tuning/finetuning-intro)## [Error codesTroubleshoot common inference errors](/guides/inference-error-codes)## [API ReferenceComplete API documentation](/api-reference/post-chatcompletions)\\n\\nWas this page helpful?\\n\\nYesNo\\n\\n[Exporting MetricsPrevious](/deployments/exporting-metrics)[Vision ModelsNext](/guides/querying-vision-language-models)\\n\\n\u2318I\\n\\n[Fireworks AI Docs home page![light logo](https://mintcdn.com/firew<TRUNCATED>"
}

I don’t see specific documentation for kimi-k2-thinking model parameters in that page. Let me search for more specific information about thinking models and prefilling:Let me search more specifically for prefilling and JSON formatting with thinking models:Based on my search, I don’t see specific documentation about prefilling or thinking mode parameters for the kimi-k2-thinking model on Fireworks. However, here’s what I found:

  - For JSON formatting, Fireworks supports response_format with {"type": "json_object"} to guarantee valid JSON output.
  - You should also instruct the model to produce JSON via a system or user message, otherwise it may generate whitespace.

For prefilling with {, this is typically an Anthropic-specific feature that may not be available in the Fireworks API. The standard approach would be:

response_format={"type": "json_object"}

As for “thinking mode” — Kimi K2 Thinking is built as a thinking agent that reasons step-by-step, so the thinking behavior should be built-in. You likely don’t need a special parameter to enable it.

Would you like to try using response_format for JSON output instead of prefilling?

Yes how to change the generator code?

🤖Reply🤖

You’ll need to modify the generate method in your Generator class to add the response_format parameter. Here’s the key change:

response = self.llm_client.chat.completions.create(
    **formatted_prompt['messages'],
    response_format={"type": "json_object"}
)

However, I notice your current code is calling formatted_prompt['messages'] directly - can you show me what your prompt_template.build() method returns? That will help me give you the exact syntax you need.

loaded_prompt = braintrust.load_prompt(project="wattbot", slug="wattbot-prompt-v1")
qa.iloc[0]
id                                                                   q003
question                What is the name of the benchmark suite presen...
answer                                            The ML.ENERGY Benchmark
answer_value                                          ML.ENERGY Benchmark
answer_unit                                                      is_blank
ref_id                                                      ['chung2025']
ref_url                              ['https://arxiv.org/pdf/2505.06371']
supporting_materials    We present the ML.ENERGY Benchmark, a benchmar...
explanation                                                         Quote
Name: 0, dtype: object
query_text = 'What is the name of the benchmark suite presented in a recent paper for measuring inference energy consumption?'
retrieved_chunks = Retriever(embedder, qs).retrieve(query_text, limit=10)
generator = Generator(llm, loaded_prompt)
generator.generate(query_text, retrieved_chunks)
{'answer': 'ML.ENERGY Benchmark',
 'answer_value': 'ML.ENERGY Benchmark',
 'answer_unit': 'is_blank',
 'ref_id': ['chung2025'],
 'ref_url': ['https://arxiv.org/pdf/2505.06371'],
 'supporting_materials': 'We present the ML.ENERGY Benchmark, a benchmark suite and tool for measuring inference energy consumption under realistic service environments',
 'explanation': 'Quote'}

RAG

class RAG:
    """End-to-end retrieve-then-generate pipeline over a question dataset."""
    def __init__(self, retriever, generator):
        fc.store_attr()
    
    def answer_question(self, query, limit=5):
        "Retrieve up to `limit` chunks for `query`; returns (answer_dict, retrieved_chunks)."
        retrieved_chunks = self.retriever.retrieve(query, limit=limit)
        return self.generator.generate(query, retrieved_chunks), retrieved_chunks
    
    def predict_dataset(self, df, limit=5, n_workers=4):
        """Answer every question in `df` concurrently; returns a DataFrame of predictions.

        `df` must have 'id' and 'question' columns; each generated answer must be
        a dict carrying the WattBot answer fields listed below.
        """
        answer_keys = ['answer', 'answer_value', 'answer_unit', 'ref_id', 'ref_url',
                       'supporting_materials', 'explanation']
        def process_row(idx_r):
            idx, r = idx_r
            answer, retrieved_chunks = self.answer_question(r['question'], limit=limit)
            # Fail loudly with the row index: the previous print-then-subscript
            # crashed anyway, but with an opaque "string indices" error.
            if not isinstance(answer, dict):
                raise TypeError(f"Row {idx}: expected dict answer, got {type(answer).__name__}: {answer!r}")
            return {'id': r['id'], 'question': r['question'],
                    **{k: answer[k] for k in answer_keys}}
    
        results = fc.parallel(process_row, df.iterrows(), threadpool=True, n_workers=n_workers)
        return pd.DataFrame(list(results))
rag = RAG(retriever, generator)
pred_answer, retrieved_chunks = rag.answer_question(query_text, limit=10); pred_answer
{'answer': 'ML.ENERGY Benchmark',
 'answer_value': 'ML.ENERGY Benchmark',
 'answer_unit': 'is_blank',
 'ref_id': ['chung2025'],
 'ref_url': ['https://arxiv.org/pdf/2505.06371'],
 'supporting_materials': 'The ML.ENERGY Benchmark: Toward Automated Inference Energy Measurement and Optimization',
 'explanation': 'Quote'}

Evaluation

def is_numeric(value):
    "Return True if `value` can be converted to a float."
    try:
        float(value)
        return True
    except (ValueError, TypeError):
        return False

def score_answer_value(predicted, expected):
    """Score a predicted answer value against the expected one; returns 1.0 or 0.0.

    Handles, in order: the 'is_blank' sentinel, list/range answers (including
    string-encoded lists like "[80,90]" as stored in the dataframe), numeric
    answers with a 0.1% relative tolerance, and finally case-insensitive
    string comparison.
    """
    if expected == "is_blank":
        return 1.0 if predicted == "is_blank" else 0.0
    
    # The dataframe may store list answers as string literals, e.g. "[80,90]".
    if isinstance(expected, str) and expected.startswith('['):
        try:
            expected = ast.literal_eval(expected)
        except (ValueError, SyntaxError):
            pass  # not a valid literal; fall through to string comparison
    
    # Compare list/range answers element-wise rather than via string repr.
    if isinstance(expected, list) and isinstance(predicted, list):
        return 1.0 if predicted == expected else 0.0
    
    if is_numeric(expected) and is_numeric(predicted):
        pred_num, exp_num = map(float, (predicted, expected))
        # Accept relative error up to 0.1% of the expected value.
        return 1.0 if abs(pred_num - exp_num) <= abs(exp_num * 0.001) else 0.0
    else:
        return 1.0 if str(predicted).strip().lower() == str(expected).strip().lower() else 0.0
score_answer_value([80,90], "[80,90]")
1.0

I want to change the answer_value to evaluate the ranges properly

🤖Reply🤖

To handle range evaluation properly, you need to detect when the expected value is a range (like [80,90]) and check if the predicted value falls within that range.

Here are a few approaches:

  1. Parse ranges and check if predicted value is within bounds
  2. Handle both list formats [80,90] and string formats "[80,90]"
  3. Consider whether predicted should be a single value in the range or also a range

What format is your predicted value in - is it a single number, a list, or a string? And should it match the exact range or just fall within it?

The answer value will be a list but the dataframe can have string of list

🤖Reply🤖

Got it! So you need to:

  1. Parse the string representation of a list (like "[80,90]") into an actual list
  2. Compare two lists for equality

You can use ast.literal_eval() to safely parse the string into a list:

import ast
parsed = ast.literal_eval("[80,90]")  # Returns [80, 90]

Then compare the predicted list with the parsed expected list. Would you like to update the score_answer_value function to handle this, or would you prefer to try it yourself first?

Please update it

🤖Reply🤖

Here’s the updated function to handle list comparisons:

def score_answer_value(predicted, expected):
    if expected == "is_blank":
        return 1.0 if predicted == "is_blank" else 0.0
    
    # Handle list/range comparisons
    if isinstance(expected, str) and expected.startswith('['):
        try:
            expected = ast.literal_eval(expected)
        except (ValueError, SyntaxError):
            pass
    
    if isinstance(expected, list) and isinstance(predicted, list):
        return 1.0 if predicted == expected else 0.0
        
    if is_numeric(expected) and is_numeric(predicted):
        pred_num, exp_num = map(float, (predicted, expected))
        return 1.0 if abs(pred_num - exp_num) <= abs(exp_num * 0.001) else 0.0
    else:
        return 1.0 if str(predicted).strip().lower() == str(expected).strip().lower() else 0.0

This adds list handling before the numeric check. Try it with your test case and let me know if it works!

score_answer_value(pred_answer['answer_value'], qa.iloc[0]['answer_value'])
1.0
def score_ref_id(predicted, expected):
    """Jaccard similarity between predicted and expected reference ids, in [0, 1].

    Either argument may be a list, a scalar, or a string repr of a list
    (e.g. "['shen2024']" as stored in the CSV). The "is_blank" sentinel is
    treated the same whether it arrives bare or wrapped in a one-element list,
    since the model emits ['is_blank'] while the dataset stores 'is_blank'.
    """
    def _as_set(v):
        # Parse string reprs like "['a','b']" first; fall back to the raw value.
        if isinstance(v, str) and v.startswith('['):
            try:
                v = ast.literal_eval(v)
            except (ValueError, SyntaxError):
                pass
        return set(v) if isinstance(v, list) else {v}

    pred_set, exp_set = _as_set(predicted), _as_set(expected)

    # Blank expected refs: all-or-nothing agreement, no Jaccard.
    if exp_set == {"is_blank"}:
        return 1.0 if pred_set == {"is_blank"} else 0.0

    union = len(pred_set | exp_set)
    return len(pred_set & exp_set) / union if union > 0 else 0.0
def score_is_na(predicted_answer, expected_answer):
    """Return 1.0 when both answers agree on blankness of answer_value, else 0.0."""
    def _blank(ans): return ans['answer_value'] == 'is_blank'
    # Agreement in either direction (both blank, or both non-blank) scores 1.0.
    return float(_blank(predicted_answer) == _blank(expected_answer))
def calculate_wattbot_score(predicted_answer, expected_row):
    """Weighted WattBot score: 75% answer value, 15% ref ids, 10% blank-agreement.

    Returns an fc.NS with the combined `score` plus each component score.
    """
    # Debug aid: surface malformed predictions missing the answer_value key
    # (the subsequent lookup will then raise with the offending row printed).
    if 'answer_value' not in predicted_answer: print(expected_row['id'], predicted_answer)
    components = dict(
        answer_score=score_answer_value(predicted_answer['answer_value'], expected_row['answer_value']),
        ref_score=score_ref_id(predicted_answer['ref_id'], expected_row['ref_id']),
        na_score=score_is_na(predicted_answer, expected_row),
    )
    total = 0.75 * components['answer_score'] + 0.15 * components['ref_score'] + 0.10 * components['na_score']
    return fc.NS(score=total, **components)
calculate_wattbot_score(pred_answer, qa.iloc[0])
namespace(score=1.0, answer_score=1.0, ref_score=1.0, na_score=1.0)
class Evaluator:
    """Runs a RAG pipeline over a QA dataframe, scoring and logging each row.

    Results are kept on `self.results` so worst cases can be inspected after
    a run via `get_worst_predictions`.
    """
    def __init__(self, rag):
        # fc.store_attr() saves `rag` onto self automatically.
        fc.store_attr()
        self.results = []
    
    def evaluate_dataset(self, df, limit=5):
        """Answer every question in `df`, log to Braintrust, return the mean score.

        `limit` is the number of chunks retrieved per question. Rows are
        processed in parallel threads via fc.parallel.
        """
        experiment = braintrust.init(project="wattbot", experiment="evaluation")
        def evaluate_row(idx_r):
            # fc.parallel feeds (index, row) pairs from df.iterrows().
            idx, r = idx_r
            # answer_question returns the chunks alongside the answer so each
            # thread works on its own chunks (avoids shared-state races).
            predicted_answer, retrieved_chunks = self.rag.answer_question(r['question'], limit=limit)
            wattbot_score = calculate_wattbot_score(predicted_answer, r)
            context = self.rag.generator.create_context(retrieved_chunks)
            prompt = self.rag.generator.format_prompt(r['question'], context)
            # NOTE(review): one shared `experiment` is logged to from multiple
            # threads — assumes braintrust's log() is thread-safe; confirm.
            experiment.log(input=r['question'], output=predicted_answer, expected=r, scores=wattbot_score.__dict__, metadata = {'context': context, 'prompt': prompt})
            return fc.NS(qid=r['id'], q=r['question'], predicted=predicted_answer, expected=r, **wattbot_score.__dict__, context=context)
        
        self.results = fc.parallel(evaluate_row, df.iterrows(), threadpool=True)
        # Mean of the weighted per-row scores (raises ZeroDivisionError on an
        # empty dataframe).
        return sum(r.score for r in self.results) / len(self.results)
    
    def get_worst_predictions(self, n=5):
        """Return the `n` lowest-scoring results from the last evaluate_dataset run."""
        sorted_results = sorted(self.results, key=lambda r: r.score)
        return sorted_results[:n]

How can I have the context and prompt logged into braintrust respecting the newlines?

🤖Reply🤖

The newlines in your context and prompt strings might not be preserved when logged to Braintrust. To ensure they’re displayed properly, you can try:

  1. Store as a list instead of a single string - Break the context into separate chunks
  2. Use explicit newline encoding - Though this usually isn’t necessary
  3. Check Braintrust’s UI settings - Some UIs have options to preserve formatting

The simplest approach is to log the context as a structured object:

metadata = {
    'context_chunks': [chunk.payload for chunk in retrieved_chunks],
    'prompt': prompt
}

Would you like to try this approach, or would you prefer to keep it as a single formatted string and investigate Braintrust’s display options?

In braintrust for some examples it is respecting the new line and for some it does not. Is there an issue on our code?

🤖Reply🤖

That’s interesting - inconsistent newline handling suggests the issue might be with how the data is being formatted before logging.

A few things to check:

  1. Are the newlines actually in the strings (\n) or are they literal line breaks?
  2. Is the context or prompt being processed differently in some cases (e.g., through JSON serialization)?
  3. Are you logging strings vs dict values differently?

Could you show me an example of one that works and one that doesn’t? Or check if the create_context method is consistently returning strings with \n characters?

Yes please

🤖Reply🤖

The core issue is that self.rag.retrieved_chunks is shared state that gets overwritten by concurrent threads. Here are two solutions:

Option 1: Make RAG return chunks along with the answer

Modify your RAG.answer_question method to return both the answer and the chunks:

def answer_question(self, query, limit=5):
    retrieved_chunks = self.retriever.retrieve(query, limit=limit)
    answer = self.generator.generate(query, retrieved_chunks)
    return answer, retrieved_chunks

Then update your evaluator to use the returned chunks instead of the shared state.

Option 2: Remove threading

Change threadpool=True to threadpool=False in your fc.parallel call. This makes execution sequential, avoiding the race condition entirely.

Which approach would you prefer? Option 1 is better if you want to keep parallel execution for speed, while Option 2 is simpler if speed isn’t critical.

Do you see any other race condition problem with the code?

🤖Reply🤖

Yes! Looking at your code more carefully, there’s another race condition in the Generator class and the braintrust.init call:

  1. Braintrust experiment object: You’re creating experiment = braintrust.init(...) once, but then multiple threads are calling experiment.log() simultaneously. This could cause issues depending on whether Braintrust’s logging is thread-safe.

  2. LLM client state: Your Generator uses self.llm_client, and if multiple threads call generate() at the same time, they’re all sharing the same client connection. This might work depending on the client implementation, but could cause problems.

Do you want to explore solutions for these, or would you like to first check if there are other issues you’ve noticed?

evals = Evaluator(rag)
evals.evaluate_dataset(df=qa, limit=10)
Skipping git metadata. This is likely because the repository has not been published to a remote yet. Remote named 'origin' didn't exist
0.7838414634146342

Why some of the outputs are empty? This is for the kimi k2 thinking model whereas the deepseek model gives proper output

🤖Reply🤖

The empty outputs ({}) from the Kimi K2 Thinking model suggest it’s not returning valid JSON in some cases. This could be due to:

  1. Thinking tokens interfering with JSON - The model might be outputting reasoning/thinking text before the JSON, breaking the json.loads() parsing
  2. Response format not properly enforced - The response_format={"type": "json_object"} might not be working as expected with this model
  3. Model-specific behavior - Thinking models sometimes structure their output differently

To debug this, you could:

  1. Print the raw response content before parsing to see what the model is actually returning
  2. Try adding error handling to catch and log failed parses
  3. Check if the prompt needs to be more explicit about JSON-only output for thinking models

Would you like to add some debugging to see what the raw responses look like?

tst_ans = rag.predict_dataset(tst)
tst_ans.isnull().sum()
id                      0
question                0
answer                  0
answer_value            0
answer_unit             0
ref_id                  0
ref_url                 0
supporting_materials    0
explanation             0
dtype: int64
tst_ans.to_csv(data_path/'tst_ans_v5.csv', index=False)

What are other ways to improve this model?

🤖Reply🤖

Great question! Here are several ways you could improve your RAG model’s performance:

Retrieval improvements: - Experiment with different chunk sizes and overlap (your current chunk_size=1500, step=1400) - Try different embedding models or ensemble multiple embeddings - Adjust the hybrid search fusion weights between dense and sparse - Increase the number of retrieved chunks and use reranking

Generation improvements: - Experiment with different LLMs or prompt engineering - Add few-shot examples to your prompt template - Implement answer verification or self-consistency checks

Evaluation-driven improvements: - Analyze your worst predictions (you already have get_worst_predictions) - Look for patterns in what types of questions fail

Which area interests you most, or would you like to start by examining your worst predictions to see where the model struggles?

evals.get_worst_predictions()
[namespace(qid='q280',
           q='Given the total pre-training GPU hours and the number of GPUs used, estimate the total wall-clock time in days required to pre-train the JetMoE-8B model.',
           predicted={'answer': 'Unanswerable from the provided context',
                      'answer_value': 'is_blank',
                      'answer_unit': 'is_blank',
                      'ref_id': ['is_blank'],
                      'ref_url': ['is_blank'],
                      'supporting_materials': 'is_blank',
                      'explanation': 'is_blank'},
           expected=id                                                                   q280
                    question                Given the total pre-training GPU hours and the...
                    answer                                                           ~13 days
                    answer_value                                                           13
                    answer_unit                                                          days
                    ref_id                                                       ['shen2024']
                    ref_url                              ['https://arxiv.org/pdf/2404.07413']
                    supporting_materials    "…30,000 H100 GPU hours… We conduct training o...
                    explanation             Math: wall_clock_hours ≈ 30,000 GPUh ÷ 96 GPUs...
                    Name: 34, dtype: object,
           score=0.0,
           answer_score=0.0,
           ref_score=0.0,
           na_score=0.0,
           context='Doc ID: strubell2019\nURL: https://arxiv.org/pdf/1906.02243\nTitle: Energy and Policy Considerations for Deep Learning in NLP\nText: day . W e train all models on\na single NVIDIA Titan X GPU, with the excep-\ntion of ELMo which was trained on 3 NVIDIA\nGTX 1080 Ti GPUs. While training, we repeat-\nedly query the NVIDIA System Management In-\nterface\n2 to sample the GPU power consumption\nand report the average over all samples. T o sample\nCPU power consumption, we use Intel’s Running\nA verage Power Limit interface.\n3\n2 nvidia-smi: https://bit.ly/30sGEbi\n3 RAPL power meter: https://bit.ly/2LObQhV\nConsumer Renew . Gas Coal Nuc.\nChina 22% 3% 65% 4%\nGermany 40% 7% 38% 13%\nUnited States 17% 35% 27% 19%\nAmazon-A WS 17% 24% 30% 26%\nGoogle 56% 14% 15% 10%\nMicrosoft 32% 23% 31% 10%\nT able 2: Percent energy sourced from: Renewable (e.g.\nhydro, solar, wind), natural gas, coal and nuclear for\nthe top 3 cloud compute providers (\nCook et al. , 2017),\ncompared to the United States, 4 China5 and Germany\n(Burger, 2019).\nW e estimate the total time expected for mod-\nels to train to completion using training times and\nhardware reported in the original papers. W e then\ncalculate the power consumption in kilowatt-hours\n(kWh) as follows. Let pc be the average power\ndraw (in watts) from all CPU sockets during train-\ning, let pr be the average power draw from all\nDRAM (main memory) sockets, let pg be the aver-\nage power draw of a GPU during training, and let\ng be the number of GPUs used to train. W e esti-\nmate total power consumption as combined GPU,\nCPU and DRAM consumption, then multiply this\nby Power Usage Effectiveness (PUE), whi\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText: 2, June 21–24, 2022, Seoul, Republic of Korea\nModel BERT BERT 6B Dense Dense Dense ViT ViT ViT ViT ViT\nfinetune pretrain Transf. 
121 169 201 Tiny Small Base Large Huge\nGPU 4·V100 8·V100 256·A100 1·P40 1·P40 1·P40 1·V100 1·V100 1·V100 4·V100 4·V100\nHours 6 36 192 0.3 0.3 0.4 19 19 21 90 216\nkWh 3.1 37.3 13,812.4 0.02 0.03 0.04 1.7 2.2 4.7 93.3 237.6\nTable 2. For the 11 models in our analysis: the type of GPU, the number of GPUs of that type, the number of hours, and the energy\nused in kWh. For example, our BERT language modeling (BERT LM) experiment used 8 V100 GPUs for 36 hours and used a total of\n37.3 kWh. We note our training run of the 6 billion parameter transformer only trained for approximately 13% of the time it would\ntake to train to completion, we estimate a full training run would consume approximately 103,593 kWh.\n4.1 NLP\nBERT Training. We monitored the energy consumption while training a BERT-small model [8] for approximately 36\nhours on 8 NVIDIA V100 GPUs. That training run consumed over 37 kWh of electricity.\nBERT Finetuning. We tracked the energy consumption while finetuning the BERT-small model on a standard natural\nlanguage inference task [48, MNLI] for approximately 6 hours on 4 NVIDIA V100 GPUs. Our finetuning run consumed\naround 3.2 kWh of electricity, i.e., less than one tenth that due to BERT-small pre-training.\n6 Billion Parameter Transformer. We tracked the energy consumption of training a large language model comprising\nover 6.1 billion parameters dur\n\nDoc ID: strubell2019\nURL: https://arxiv.org/pdf/1906.02243\nTitle: Energy and Policy Considerations for Deep Learning in NLP\nText: n, an increase of just 0.1\nBLEU at the cost of at least $150k in on-demand\ncompute time and non-trivial carbon emissions.\n4.2 Cost of development: Case study\nT o quantify the computational requirements of\nR&D for a new model we study the logs of\nall training required to develop Linguistically-\nInformed Self-Attention (\nStrubell et al. 
, 2018), a\nmulti-task model that performs part-of-speech tag-\nging, labeled dependency parsing, predicate detec-\ntion and semantic role labeling. This model makes\nfor an interesting case study as a representative\nNLP pipeline and as a Best Long Paper at EMNLP .\nModel training associated with the project\nspanned a period of 172 days (approx. 6 months).\nDuring that time 123 small hyperparameter grid\nsearches were performed, resulting in 4789 jobs\nin total. Jobs varied in length ranging from a min-\nimum of 3 minutes, indicating a crash, to a maxi-\nmum of 9 days, with an average job length of 52\nhours. All training was done on a combination of\nNVIDIA Titan X (72%) and M40 (28%) GPUs.\n8\nThe sum GPU time required for the project\ntotaled 9998 days (27 years). This averages to\n8 W e approximate cloud compute cost using P100 pricing.\nEstimated cost (USD)\nModels Hours Cloud compute Electricity\n1 120 $52–$175 $5\n24 2880 $1238–$4205 $118\n4789 239,942 $103k–$350k $9870\nT able 4: Estimated cost in terms of cloud compute and\nelectricity for training: (1) a single model (2) a single\ntune and (3) all models trained during R&D.\nabout 60 GPUs running constantly th\n\nDoc ID: luccioni2023\nURL: https://arxiv.org/pdf/2302.08476\nTitle: Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning\nText: e to ML model are rising.\nIn Section 4, we have discussed that the main sources of variance in the amount of emissions associated to training\nmachine learning models is due to the carbon intensity of the primary energy source and the training time, with the\npower consumption of the hardware having a smaller influence. In terms of training time, the models in our sample\nrange from just about 15 minutes (total GPU/TPU time) up to more than 400,000 hours, with a median of 72 hours,\npointing again to large variance in our sample. 
While the maximum of of 400,000 GPU hours (equivalent to about 170\ndays with 100 GPUs) in our sample seems very large, note that the total training time of GPT-3 was estimated to be\nover 3.5 million hours (14.8 days with 10,000 GPUs) [38]. Obviously, such long training times result in large amounts of\ncarbon emissions, even with lower carbon intensity energy sources. By way of illustration, the model with the longest\ntraining time in our sample would have reduced by about 30 times the carbon emissions had it used the grid with the\nlowest carbon intensity in our sample, but it would have still resulted in over 1 ton of CO2eq. Also, generally speaking,\nwe can see that the models at the higher end of the emissions spectrum tend to be Transformer-based model with more\nlayers (as well as using techniques such as Neural Architecture Search to find optimal combinations of parameters),\nwhereas simpler and shallower models such as convolutional neural networks te\n\nDoc ID: cottier2024\nURL: https://arxiv.org/pdf/2405.21015\nTitle: The Rising Costs of Training Frontier AI Models\nText: of chips ×\n\x10\n1 − exp\n\x02\n− Training time × r ln 10\n\x03\x11\nwhere training time is in years. However, we could estimate chip-hours more often and more reliably than the training\ntime or the number of chips separately. This is because chip-hours can also be estimated from training compute in\nFLOP divided by the FLOP/s achieved during training. We used a linear approximation to take advantage of these\nchip-hour estimates:\nAmortized training cost = Start value per chip × Training chip-hours\n(365 × 24) hours/year × r ln 10\nThis approximation is valid if(Training time)×r ln 10is small, and this is the case for the training times in our data and\nour choice of r = 0.14. In an extreme case, a training time of 1 year results in 1 × 0.14 ln(10)∼= 32%deprecation\ncompared to 1 − exp(−1 × 0.14 ln(10))∼= 28%depreciation. 
This is not a large difference relative to other sources\nof uncertainty.\nDue to NVIDIA covering defects and component failures under warranty, we concluded that hardware failures are not a\nsignificant source of depreciation relative to hardware progress. As one data point, an average of 1 to 2 failures per\nweek occurred when training the BLOOM model on a cluster of 384 NVIDIA A100 GPUs [25]. Even if these were all\ncatastrophic failures, the expected hardware lifetime would be 3.7 years. We expect that NVIDIA replaces or repairs\ndefective GPUs on a faster timescale, which makes the cost of failure small compared to hardware price depreciation.\nA.4 Energy cost estimation\nTo model th\n\nDoc ID: luccioni2023\nURL: https://arxiv.org/pdf/2302.08476\nTitle: Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning\nText:  [38] and it remains a fair approximation of the\nactual energy consumption of many hardware models. We provide more information about TDP and the hardware used\nfor training the models in our sample in Section A.2 of the Appendix.\nTraining Time. Training time was computed as the total number of hardware hours, which is different from the\n"wall time" of ML model training, since most models were trained on multiple units at once. 
For instance, if training a\n1For instance, methane is 28 times more potent than CO 2 based on its 100-year global warming potential, so energy generation emitting 1 gram of\nmethane per kWh will emit 28 grams of CO2eq per kWh.\nManuscript pending review Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning 5\nmodel used 16 GPUs for 24 hours, this equals a training time of 384 GPU hours ; a model using 8 GPUs for 48 hours will\ntherefore have an equivalent training time.\n4 DATA ANALYSIS\nIn the sections below, we present several aspects regarding the carbon footprint of training ML models, examining the\nmain sources of energy used for training (§ 4.1), the order of magnitude of CO2 emissions produced (§ 4.2), the evolution\nof these emissions over time (§ 4.3) and the relationship between carbon emissions and model performance (§ 4.4) 2.\n4.1 What are the main sources of energy used for training ML models?\nThe primary energy source used for powering an electricity grid is the single biggest influence on the carbon intensity\nof that \n\nDoc ID: cottier2024\nURL: https://arxiv.org/pdf/2405.21015\nTitle: The Rising Costs of Training Frontier AI Models\nText:  of about $5,000.\n14 A.3 Amortization model\nAs explained in section 2.2, we estimated the value of the training hardware at the beginning of training as:\nStart value per chip = Acquisition cost per chip\nexp\n\x10\x02\nTraining start date − Hardware availability date\n\x03\n· r ln 10\n\x11\nwhere r is a depreciation rate in orders of magnitude per year, and the difference in dates is in years. The hardware\navailability date depended on the type of hardware. If the hardware was a Google TPU, we used the hardware\nannouncement date. For GPUs, we used a 90-day buffer between the GPU first going on the market and the GPU\nactually being shipped to the buyer. 
Our results are robust to variations in this buffer time—see Appendix B.4.\nFor the training start date, there were a few known cases—for example, GPT-4 finished training in August 2022 [12].\nOtherwise, we subtracted the training time from the publication date, and then subtracted a further 60 days to account\nfor time spent evaluating the model and writing the paper. Again, our results are robust to variations in this buffer. If the\ntraining time was unknown, we used the median of known values in our dataset, which was approximately 33 days.\nThe precise way to amortize the training cost through exponential depreciation is:\nAmortized training cost = Start value per chip × Number of chips × Depreciation during training\n= Start value per chip × Number of chips ×\n\x10\n1 − exp\n\x02\n− Training time × r ln 10\n\x03\x11\nwhere training time is in years. However, we cou\n\nDoc ID: wu2021a\nURL: https://arxiv.org/pdf/2111.00364\nTitle: Sustainable AI: Environmental Implications, Challenges and Opportunities\nText:  large collection of diverse ML\nideas are explored simultaneously at-scale. Thus, during this\nphase, we observe unique system resource requirements from\nthe large pool of training experiments. Within Facebook’s ML\nresearch cluster, 50% (p50) of ML training experiments take up\nto 1.5 GPU days while 99% (p99) of the experiments complete\nwithin 24 GPU days. There are a number of large-scale, trillion\nparameter models which require over 500 GPUs days.\nOnce a ML solution is determined as promising, it moves into\nTraining where the ML solution is evaluated using extensive\nproduction data — data that is more recent, is larger in quantity,\nand contains richer features . The process often requires\nadditional hyper-parameter tuning. Depending on the ML task\nrequirement, the models can be trained/re-trained at different\nfrequencies. 
For example, models supporting Facebook’sSearch\nservice were trained at an hourly cadence whereas the Language\nTranslation models were trained weekly [24]. A p50 production\nmodel training workflow takes 2.96 GPU days while a training\nworkflow at p99 can take up to 125 GPU days.\nFinally, for Inference, the best-performing model is de-\nployed, producing trillions of daily predictions to serve billions\nof users worldwide. The total compute cycles for inference\npredictions are expected to exceed the corresponding training\ncycles for the deployed model.\nB. Machine Learning System Life Cycle\nLife Cycle Analysis (LCA) is a common methodology to\nassess the carbon emis\n\nDoc ID: patterson2021\nURL: https://arxiv.org/pdf/2104.10350\nTitle: Carbon Emissions and Large Neural Network Training\nText: takes   ~14.8   days   for   10,000   \nGPUs   at   24.6   TeraFLOPS/sec   to   compute   3.14E+23   FLOPS.   For   the   CO 2 e   calculation,   it   doesn’t   \nactually   matter   whether   it   takes   2   weeks   on   10,000   GPUs   or   20   weeks   on   1,000   GPUs,   but   we   need   \none   number   for   Table   4,   so   we   used   NVIDIA’s   suggestion   of   10,000   GPUs.   \n● Total   Computation   (Table   1,   row   13;   Table   4,   row   16):    We   calculate   from   measured   performance,   \nnumber   of   chips,   and   days   to   train   (except   for   GPT-3,   as   OpenAI   published   the   total   FLOPS).   \n● %   of   Google   2019   Energy   Consumption.   (Table   4,   row   17):    For   all   models   (even   those   not   actually   run   \nin   Google   datacenters   or   not   run   in   2019),   we   calculate   the   percentage   of   Google’s   total   energy   \nconsumption   of   12.2   Terawatt-hours   in   2019   [Goo20].     \n● Ratio   of   round   trips   (Table   4,   row   22) .   
To   give   perspective   on   the   CO 2 e   cost   of   training   a   model   is   \ncompared   to   other   activities,   we   show   the   CO 2 e   of   passenger   jets.    Google   Flights    calculated   the   \naverage   CO 2    emission   for   all   the   direct   flights   between   San   Francisco   (SFO)   and   New   York   (JFK)   in   its   \ndatabase   as   90.2t,   so   the   average   round   trip   is   180.4t.   (This   is   for   the\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText: the energy consumption of training a large language model comprising\nover 6.1 billion parameters during 8 days on 256 NVIDIA A100s. The total energy amounted to a staggering 13.8 MWh.\nThis model was not trained to completion, but only until 13%; a full training run would take 60 days. Thus, we estimate\nthe total energy consumption to train this model to completion would be approximately (60/8)∗13.8 = 103.5 MWh, or\n103,500 kWh — almost 2800 times more than training the BERT-small model!\n4.2 Computer Vision\nDenseNets. We trained three sizes of DenseNets [19] on MNIST [25]. The jobs lasted between 20 and 25 minutes and\nconsumed between 20 and 38Wh (or 0.02 to 0.04 kWh) of electricity, which is negligible compared to the other models.\nVision Transformers. We evaluated the energy consumption during the training of five sizes of Vision Transformers [9]\non ImageNet [7]. For the smallest ViT experiment (ViT tiny), training lasted around 19 hours on a single V100 and\nconsumed approximately 1.7 kWh. For the largest one (ViT huge), training lasted more than 9 days on a 4 V100s and\nconsumed approximately 237 kWh. The full list of models can be found in Table 2.\n5 EMISSIONS BY REGION AND TIME OF DAY\nUsing the methodology presented above, we provide some of the first measurements of the differences of actual\ndatacenters from a major cloud provider. 
Importantly, what we have is a time series of marginal emissions: for example,\nif a job were to run from 1 pm to 5 pm in the US West region wit'),
 namespace(qid='q170',
           q='How many days of CO₂ emissions from an average American life are equivalent to training BERT base?',
           predicted={'answer': '6 years, 1 month',
                      'answer_value': '6.083',
                      'answer_unit': 'years',
                      'ref_id': ['morrison2025'],
                      'ref_url': ['https://arxiv.org/pdf/2503.05804'],
                      'supporting_materials': 'Table 2: We list the estimated power usage, carbon emissions, and water consumption from training our dense transformers... Llama 2 7B 81 31 6 yrs, 1 mo - -',
                      'explanation': 'Table'},
           expected=id                                                                   q170
                    question                How many days of CO₂ emissions from an average...
                    answer                                                               14.4
                    answer_value                                                         14.4
                    answer_unit                                                          days
                    ref_id                                                   ['strubell2019']
                    ref_url                              ['https://arxiv.org/pdf/1906.02243']
                    supporting_materials                                  Table 1 and Table 3
                    explanation                               1438 lbs / 99.8 lbs/day = 14.4 
                    Name: 18, dtype: object,
           score=0.1,
           answer_score=0.0,
           ref_score=0.0,
           na_score=1.0,
           context='Doc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText:  average US home energy use is estimated to emit 8.30 metric tons (using the sum of emissions from generating\nelectricity, natural gas, liquid petroleum, and fuel oil), and one rail car of coal is estimated to emit 181.29 metric tons.\n8 Measuring the Carbon Intensity of AI in Cloud Instances FAccT ’22, June 21–24, 2022, Seoul, Republic of Korea\nFig. 2. Emissions for our 11 experiments described in §4. For each model we show a vertical blue bar, where the top of the bar is\nthe max, the bottom is the min, and the black line represents the average emissions (across regions and time of year). First and\nfourth quartiles are represented by the light blue at the top and bottom of each vertical blue bar. The largest training runs (e.g., 6\nbillion parameter LM) releases a significant amount of emissions, no matter the region (and recall the 6 billion parameter LM is only\ntrained for 13% of a full run, so a full run would emit about an order of magnitude more emissions than reported here). The smallest\nexperiments emit very little. Presented on a log scale, with references on the right indicating equivalent sources of emissions per the\nUnited States Environmental Protection Agency [46].\nThe largest experiment in our set is the 6 billion parameter transformer, and that model is only partially trained (as\ndescribed in §4, it is only trained for about 13% of the time needed to converge). Even partially trained, experiments of\nthis size can emit more CO2 than all emissions from the average\n\nDoc ID: luccioni2024\nURL: https://arxiv.org/pdf/2311.16863\nTitle: Power Hungry Processing: Watts Driving the Cost of AI Deployment?\nText: s, architecture\nand carbon emissions of their products, we can make a comparison based on the experiments carried out in the\npresent study. 
For instance, the average emissions of a BERT-based model fine-tuned for extractive question answering\n(bert-large-uncased-whole-word-masking-finetuned-squad), a task akin to extractive web search, is 0.70g𝐶𝑂2𝑒𝑞\nper 1,000 queries, which is less than 3 times that of the multi-purpose models (2.36g for Flan-T5 base and 2.34g for\nBLOOMz-560M). The difference is much more drastic if comparing BERT-based models for tasks such as text classification\nwith the larger multi-purpose models: for instance bert-base-multilingual-uncased-sentiment emits just 0.32g of\n𝐶𝑂2𝑒𝑞per 1,000 queries, compared to 2.66g for Flan-T5-XL and 4.67g for BLOOMz-7B. For comparison, the first PaLM\nmodel, released in 2022, has 540 billion parameters [7], whereas GPT-3 has 175 billion parameters [5] 8. While we see\nthe benefit of deploying generative zero-shot models given their ability to carry out multiple tasks, we do not see\nconvincing evidence for the necessity of their deployment in contexts where tasks are well-defined, for instance web\nsearch and navigation, given these models’ energy requirements.\nFinally, the intent of our study is to set the stage for better understanding of the energy requirements and carbon\nemissions of the final, often overlooked, step in the ML model life cycle: model deployment. The comparison between\ntraining, finetuning and inference energ\n\nDoc ID: luccioni2023\nURL: https://arxiv.org/pdf/2302.08476\nTitle: Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning\nText: omparing the carbon emissions of two or more models and approaches. The\nfirst paper to do so was written by Strubell et al., which estimated that the emissions of training and fine-tuning a large\nTransformer model with Neural Architecture Search (NAS) produced 284,019 kg (626,155 lbs) of CO2, similar to the\nlifetime emissions of five US cars. [48]. 
This perspective has since been explored further via analyses of the carbon\nfootprint of different neural network architectures [31, 37, 38] and the relative efficiency of different methods [35, 56].\nThese empirical studies are very recent (post-2019), remain relatively sparse and biased towards certain research\nareas (i.e. Natural Language Processing), and there are many aspects of the emissions of model training that remain\nunexplored. In sum, there is a need for a more broad and multi-faceted analysis in order to better understand the scale\nand variation of carbon emissions in our community.\nTools and approaches for measuring carbon emissions. Developing standardized approaches for estimating the carbon\nemissions of model training has also been the focus of much work [5, 20, 26, 27, 30, 45, 51]. As a result, there are several\ntools that exist for this purpose, such as Code Carbon and the Experiment Impact Tracker, which can be used during the\nmodel training process, or the ML CO2 Calculator, which can be used after training, all of which provide an estimate\nof the amount of carbon emitted. However, a recent study on different ca\n\nDoc ID: luccioni2023\nURL: https://arxiv.org/pdf/2302.08476\nTitle: Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning\nText: e to ML model are rising.\nIn Section 4, we have discussed that the main sources of variance in the amount of emissions associated to training\nmachine learning models is due to the carbon intensity of the primary energy source and the training time, with the\npower consumption of the hardware having a smaller influence. In terms of training time, the models in our sample\nrange from just about 15 minutes (total GPU/TPU time) up to more than 400,000 hours, with a median of 72 hours,\npointing again to large variance in our sample. 
While the maximum of of 400,000 GPU hours (equivalent to about 170\ndays with 100 GPUs) in our sample seems very large, note that the total training time of GPT-3 was estimated to be\nover 3.5 million hours (14.8 days with 10,000 GPUs) [38]. Obviously, such long training times result in large amounts of\ncarbon emissions, even with lower carbon intensity energy sources. By way of illustration, the model with the longest\ntraining time in our sample would have reduced by about 30 times the carbon emissions had it used the grid with the\nlowest carbon intensity in our sample, but it would have still resulted in over 1 ton of CO2eq. Also, generally speaking,\nwe can see that the models at the higher end of the emissions spectrum tend to be Transformer-based model with more\nlayers (as well as using techniques such as Neural Architecture Search to find optimal combinations of parameters),\nwhereas simpler and shallower models such as convolutional neural networks te\n\nDoc ID: luccioni2025c\nURL: https://arxiv.org/pdf/2506.15572\nTitle: Misinformation by Omission: The Need for More Environmental Transparency in AI\nText:  31\nIn the case of the latter, they estimated that the NAS approach, assuming United States average electricity GHG emissions\nintensity and typical AI hardware running in an average-efficiency datacenter, could yield 626,155 pounds (284 metric tons)\nCO2-equivalent GHG emissions (CO2e), or about five times the emissions of a car during its lifetime, including fuel.\nThe research article was written for a specialized audience of AI and NLP researchers, who would have the background\nknowledge to understand the appropriate scoping for the estimate. However, an author’s tweet publicizing the paper and\nfeaturing a table containing the “five cars” estimate was widely shared on social media, leading to the publication being picked\nup by numerous media outlets (including MIT Technology Review32 and Forbes33). 
The “five cars” number has since been\nmisinterpreted as a proxy for the carbon footprint of training AI models at large, which is misleading given the diversity of\narchitectures, training approaches and electricity sources used for powering AI model training; the original article reports AI\ntraining workloads emitting as little as 26 pounds (11.8 kg) CO2e (assuming U.S. average energy carbon emissions intensity),\nand AI model training more broadly often requires even less energy and corresponding emissions.\nFurther, the NAS training workload represents a large-scale procedure that is meant to be and is in practice performed much\nless frequently than the average AI model training w\n\nDoc ID: wu2021a\nURL: https://arxiv.org/pdf/2111.00364\nTitle: Sustainable AI: Environmental Implications, Challenges and Opportunities\nText:  carbon footprint of large-scale ML tasks (Figure 4). Taking into\naccount carbon-free energy, such as solar, the operational energy consumption\ncan be significantly reduced, leaving the manufacturing carbon cost as the\ndominating source of AI’s carbon footprint.\nBoth Training and Inference can contribute significantly to the\noverall carbon footprint of machine learning tasks at Facebook.\nThe exact breakdown between the two phases varies across\nML use cases.\nThe overall operational carbon footprint is categorized into\noffline training, online training, and inference. Offline training\nencompasses both experimentation and training models with\nhistorical data. Online training is particularly relevant to\nrecommendation models where parameters are continuously\nupdated based on recent data. The inference footprint represents\nthe emission from serving production traffic. The online training\nand inference emissions are considered over the period of\noffline training. 
For recommendation use cases, we find the\ncarbon footprint is split evenly between training and inference.\nOn the other hand, the carbon footprint of LM is dominated\nby the inference phase, using much higher inference resources\n(65%) as compared to training (35%).\nBoth operational and embodied carbon emissions can con-\ntribute significantly to the overall footprint of ML tasks .\nOperational Carbon Footprint: Across the life cycle of\nthe Facebook models shown in Figure 4, the average carbon\nfootprint is 1.8 × higher than that of th\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText: er line) at different times throughout the year.\nWhat do emissions look like across the 11 experiments described in §4? In Figure 2 we show results for all 11\nexperiments, which cover two BERT experiments (finetuning and language modeling), partial training of a 6.1 billion\nparameter Transformer, 3 sizes of DenseNets, and five sizes of Vision Transformers. Each experiment is represented by\na vertical blue bar showing the range of emissions that would be emitted for that experiment across different regions.\nThe top of the blue bar is the emissions from running that experiment in the region with the most emissions, the bottom\nis the emissions from running that experiment in the region with the least emissions, the black line represents the\naverage, and the light blue regions are the top and bottom quartiles.\nIn Figure 2 we also include estimates of equivalent sources of emissions per the United States Environmental\nProtection Agency [46]. 
One phone charge is estimated to emit 8.22 ×10−6 metric tons (using US national weighted\naverage CO2 marginal emission rate for delivered electricity), one mile driven is estimated to emit 3.98 ×10−4 metric\ntons (using average US passenger vehicle, which gets 22.5 miles per gallon of gasoline), one gallon of gasoline consumed\nis estimated to emit 8.887 ×10−3 metric tons, one barrel of crude oil consumed is estimated to emit 0.43 metric tons,\none average US home energy use is estimated to emit 8.30 metric tons (using the sum of emissions from g\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText: com/Page.aspx?id=1012\n13https://anysilicon.com/die-per-wafer-formula-free-calculators/\n14https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/\n6 Published as a conference paper at ICLR 2025\nTable 2: We list the estimated power usage, carbon emissions, and water consumption from training our\ndense transformers, ranging from 20 million to 13 billion parameters, trained on 1.7 to 5.6 trillion tokens, and\na mixture-of-experts model with 1 billion active and 7 billion total parameters, trained to 5 trillion tokens. We\nfind that the environmental impact is quite high, even for our relatively small models. Training our series of\nmodels emitted equivalent carbon to over 65 years of electricity use by the average household in the U.S., and\nconsumed equivalent water to the average person in the U.S. for about 17 years.\n* One of the original OLMo 7B models was trained on LUMI, which runs entirely on hydroelectric power. See\nGroeneveld et al. (2024) for more information.\n† denotes unreleased models that were trained for various internal experi
========== TRUNCATED ==========
) kWh per GPU hour, b) CO 2 grams per GPU hour, and c) CO 2 grams per kWh. Here we\ncompare against [34] and [33] which report information about training especially large models. Their estimates also\n9 FAccT ’22, June 21–24, 2022, Seoul, Republic of Korea Dodge et al.\ninclude additional sources of CO2, like PUE (Power Usage Effectiveness) of their datacenters, so we expect their kWh per\nGPU hour and CO2 per GPU hour to be higher than our estimates (which only count the GPU electricity consumption).\nAcross our experiments, we find kWh per GPU hour to range from 0.07 to 0.28\n\nDoc ID: strubell2019\nURL: https://arxiv.org/pdf/1906.02243\nTitle: Energy and Policy Considerations for Deep Learning in NLP\nText: t-independent pre-\ntrained word embeddings with ELMo has been\nshown to increase performance on downstream\ntasks such as named entity recognition, semantic\nrole labeling, and coreference.\nPeters et al. (2018)\nreport that ELMo was trained on 3 NVIDIA GTX\n1080 GPUs for 2 weeks (336 hours).\nBERT .The BERT model (\nDevlin et al. , 2019) pro-\nvides a Transformer-based architecture for build-\ning contextual representations similar to ELMo,\nbut trained with a different language modeling ob-\njective. BERT substantially improves accuracy on\ntasks requiring sentence-level representations such\nas question answering and natural language infer-\nence.\nDevlin et al. (2019) report that the BERT\nbase model (110M parameters) was trained on 16\nTPU chips for 4 days (96 hours). NVIDIA reports\nthat they can train a BERT model in 3.3 days (79.2\nhours) using 4 DGX-2H servers, totaling 64 T esla\nV100 GPUs (\nForster et al. , 2019).\nGPT -2. This model is the latest edition of\nOpenAI’s GPT general-purpose token encoder,\nalso based on Transformer-style self-attention and\ntrained with a language modeling objective (\nRad-\nford et al. , 2019). By training a very large model\non massive data, Radford et al. 
(2019) show high\nzero-shot performance on question answering and\nlanguage modeling benchmarks. The large model\ndescribed in\nRadford et al. (2019) has 1542M pa-\nrameters and is reported to require 1 week (168\nhours) of training on 32 TPUv3 chips.\n6\n3 Related work\nThere is some precedent for work characterizin\n\nDoc ID: luccioni2023\nURL: https://arxiv.org/pdf/2302.08476\nTitle: Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning\nText:  [38] and it remains a fair approximation of the\nactual energy consumption of many hardware models. We provide more information about TDP and the hardware used\nfor training the models in our sample in Section A.2 of the Appendix.\nTraining Time. Training time was computed as the total number of hardware hours, which is different from the\n"wall time" of ML model training, since most models were trained on multiple units at once. For instance, if training a\n1For instance, methane is 28 times more potent than CO 2 based on its 100-year global warming potential, so energy generation emitting 1 gram of\nmethane per kWh will emit 28 grams of CO2eq per kWh.\nManuscript pending review Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning 5\nmodel used 16 GPUs for 24 hours, this equals a training time of 384 GPU hours ; a model using 8 GPUs for 48 hours will\ntherefore have an equivalent training time.\n4 DATA ANALYSIS\nIn the sections below, we present several aspects regarding the carbon footprint of training ML models, examining the\nmain sources of energy used for training (§ 4.1), the order of magnitude of CO2 emissions produced (§ 4.2), the evolution\nof these emissions over time (§ 4.3) and the relationship between carbon emissions and model performance (§ 4.4) 2.\n4.1 What are the main sources of energy used for training ML models?\nThe primary energy source used for powering an electricity grid is the single biggest influence on the carbon intensity\nof that 
\n\nDoc ID: rubei2025\nURL: https://arxiv.org/pdf/2501.05899\nTitle: Prompt engineering and its implications on the energy consumption of Large Language Models\nText: e training when\nthe carbon emission is below a certain threshold. The results\nshows that the proposed approach succeed in reducing the\ncarbon emission even though the region may impact the ob-\ntained results. Liu and Yin [37] investigate how to reduce and\nmeasure the consumption of pre-trained models by combining\nfine-tuning and efficient tokenizers. In particular, BERT, Distil-\nBERT, and T5 models are compared using SQuAD benchmark\n[38] in terms of accuracy and carbon emissions. The experi-\nmental results reveal that both the T5 and BERT models emit-\nted considerably more CO2 compared to DistilBERT and the\nT4 GPU contributes in reducing the overall carbon emissions.\nSamsi et al. [13] compare the inference performance in terms\nof watts of different Llama models, i.e., evaluating smaller\nmodels (7B, 13B) against the largest available version (65B) at\nthe time of writing. In addition, the authors consider different\nGPUs, i.e., V100 and A100. The study reveals that 8 V100\nGPUs each with 32 GB of RAM or 4 A100 GPUs each with\n80GB of memory are required for any meaningful inferences\nwith the 65B LLaMA model, thus making small models a\nsuitable choice for energy-efficient applications. Cursaro et al. [39] conduct a controlled experiment in which code generated\nby CodeLlama is compared with the human one considering\ndifferent languages, i.e., C++, Java, and Python, tested on a\ndedicated platform. 
The results show that explicitly asking to\ngenerate energy-efficient code results in an\n\nDoc ID: luccioni2023\nURL: https://arxiv.org/pdf/2302.08476\nTitle: Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning\nText: e to ML model are rising.\nIn Section 4, we have discussed that the main sources of variance in the amount of emissions associated to training\nmachine learning models is due to the carbon intensity of the primary energy source and the training time, with the\npower consumption of the hardware having a smaller influence. In terms of training time, the models in our sample\nrange from just about 15 minutes (total GPU/TPU time) up to more than 400,000 hours, with a median of 72 hours,\npointing again to large variance in our sample. While the maximum of of 400,000 GPU hours (equivalent to about 170\ndays with 100 GPUs) in our sample seems very large, note that the total training time of GPT-3 was estimated to be\nover 3.5 million hours (14.8 days with 10,000 GPUs) [38]. Obviously, such long training times result in large amounts of\ncarbon emissions, even with lower carbon intensity energy sources. By way of illustration, the model with the longest\ntraining time in our sample would have reduced by about 30 times the carbon emissions had it used the grid with the\nlowest carbon intensity in our sample, but it would have still resulted in over 1 ton of CO2eq. 
Also, generally speaking,\nwe can see that the models at the higher end of the emissions spectrum tend to be Transformer-based model with more\nlayers (as well as using techniques such as Neural Architecture Search to find optimal combinations of parameters),\nwhereas simpler and shallower models such as convolutional neural networks te\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText: the energy consumption of training a large language model comprising\nover 6.1 billion parameters during 8 days on 256 NVIDIA A100s. The total energy amounted to a staggering 13.8 MWh.\nThis model was not trained to completion, but only until 13%; a full training run would take 60 days. Thus, we estimate\nthe total energy consumption to train this model to completion would be approximately (60/8)∗13.8 = 103.5 MWh, or\n103,500 kWh — almost 2800 times more than training the BERT-small model!\n4.2 Computer Vision\nDenseNets. We trained three sizes of DenseNets [19] on MNIST [25]. The jobs lasted between 20 and 25 minutes and\nconsumed between 20 and 38Wh (or 0.02 to 0.04 kWh) of electricity, which is negligible compared to the other models.\nVision Transformers. We evaluated the energy consumption during the training of five sizes of Vision Transformers [9]\non ImageNet [7]. For the smallest ViT experiment (ViT tiny), training lasted around 19 hours on a single V100 and\nconsumed approximately 1.7 kWh. For the largest one (ViT huge), training lasted more than 9 days on a 4 V100s and\nconsumed approximately 237 kWh. The full list of models can be found in Table 2.\n5 EMISSIONS BY REGION AND TIME OF DAY\nUsing the methodology presented above, we provide some of the first measurements of the differences of actual\ndatacenters from a major cloud provider. 
Importantly, what we have is a time series of marginal emissions: for example,\nif a job were to run from 1 pm to 5 pm in the US West region wit\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText:  to\nsingle-instance emissions calculations. We leave it open for future research to address how to appropriately allocate\nCO2 emissions from such data center-wide processes to individual reserved cloud instances.\n4 ELECTRICITY CONSUMPTION FOR AI WORKLOADS\nAs outlined in §3.1, calculating software carbon intensity begins with recording the electricity consumption, which\ncan then be mapped to emissions based on the emissions of the grid being used. In this section, we present data on\nelectricity consumption for experiments training 11 different models, covering natural language processing (NLP) and\ncomputer vision applications, ranging from less than an hour on a single GPU up to more than 8 days on 256 GPUs. We\noutline both the experiments themselves and their electricity consumption, and in the following section we use the\nelectricity consumption and carbon intensity tool described in the previous section to calculate their software carbon\nintensity.\n2We note that our conclusions drawn from experiments and analyses on time-shifting and location-shifting are still applicable with tools that measure\nmore electricity than just the GPU.\n3https://www.google.com/about/datacenters/efficiency/\n4One of the largest single source of CO2 emissions, contributing to 7%-8% of global emissions, is the production of cement [20].\n6 Measuring the Carbon Intensity of AI in Cloud Instances FAccT ’22, June 21–24, 2022, Seoul, Republic of Korea\nModel BERT BERT 6B Dense Dense Dense ViT ViT ViT ViT V\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText: e carbon intensity; Llama 3 used a region-specific 
carbon intensity. All 3\nassumed 100% GPU power draw throughout training.\n3 Published as a conference paper at ICLR 2025\nwhere the cost of a scientific resultR (e.g. a claim that a particular training setup reachesX accuracy\non benchmark Y ) is proportional to the product of the cost of processing a single example E, the\nsize of the training dataset D, and the number of hyperparameter experiments H. In previous work,\nE · D, the cost of training on the training dataset, is what is most commonly reported, and H, the\ntotal number of experiments, is most often excluded.\nIn our analysis, we calculate the total power consumption during model training, development, and\ninference, and use this to estimate the total carbon emissions and water consumption during each\nstage. We follow previous work (Luccioni et al., 2023; Dubey et al., 2024; Gemma Team et al.,\n2024) to calculate CO2 emissions (CO2e) from power consumption:\nCO2e = P · PUE · CI (2)\nwhere the total carbon emissions is equal to the power usage P, multiplied by the power usage\neffectiveness (PUE)6 of the data center, multiplied by the carbon intensity CI of the local power\ngrid. We run all experiments in our two GPU clusters, Jupiter and Augusta, which are located in\nTexas and Iowa, respectively (see OLMo et al. (2025) for more information). Our 13B model was\ntrained on Augusta, and all other experiments analyzed in this paper were trained on Jupiter.\nOur data center provider\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText:  coefficients of 1.29 L / kWh and 1.2\nrespectively, and carbon intensity of 0.332 kg CO2e / kWh. Note the difference in units for energy consumption\nand carbon emissions, namely MWh → kWh, tons → grams CO2eq, and kL → L. The measurements reported\nin this table account for the GPU processes associated with active inference, but not CPU or RAM associated\nwith e.g. 
server overhead. Thus, these numbers can be considered as lower bounds on usage in similar settings.\nAlso of note is the relatively small variability in carbon emissions and water consumption across different model\nsizes in cases where batches are not saturated, despite faster inference in smaller models when fully saturated;\ngreater peak efficiency does not guarantee efficient deployment if inference is not optimized. We do not report\n”break-even” points for Qwen 2.5 because its training costs are not public.\nModel\nRequest\nfreq.\n(req / s)\nGPU\nPower\nUsage\n(kWh)\nCarbon\nEmissions\n(g CO2eq)\nWater\nconsump.\n(L)\nSeconds\nper 100 req.\n# Inf. for\nCO2 equiv.\nw/ training\nLlama 3.2 1B ∞ 0.003 1.0 0.004 1.38 258 bil.\n8 0.036 12.0 0.054 12.64 21.5 bil.\n1 0.160 53.1 0.238 100.58 4.83 bil.\nQwen 2.5 7B ∞ 0.009 3.0 0.013 1.79 —\n8 0.053 17.6 0.079 12.77 —\n1 0.308 102.3 0.459 100.58 —\nLlama 3.1 8B ∞ 0.011 3.7 0.016 2.13 276 bil.\n8 0.051 16.9 0.076 12.79 59.5 bil.\n1 0.333 110.6 0.496 100.64 9.12 bil.\nLlama 2 13B ∞ 0.034 11.3 0.051 6.53 13.3 bil.\n8 0.060 19.9 0.089 13.09 7.52 bil.\n1 0.401 133.1 0.597 100.73 1.13 bil.\nOLMo 1 1B (3T) ∞ 0.004'),
 namespace(qid='q272',
           q='How many U.S. household-years of electricity consumption is training a 6.1B-parameter language model equivalent to?',
           predicted={'answer': 'The training of a 6.1B-parameter language model to completion is estimated to consume 103.5 MWh of energy, which is equivalent to approximately 20.7 years of electricity use by an average U.S. household.',
                      'answer_value': 20.7,
                      'answer_unit': 'years',
                      'ref_id': ['dodge2022'],
                      'ref_url': ['https://arxiv.org/pdf/2206.05229'],
                      'supporting_materials': 'Thus, we estimate the total energy consumption to train this model to completion would be approximately (60/8)*13.8 = 103.5 MWh',
                      'explanation': 'Math: (60/8)*13.8 = 103.5 MWh. The equivalent U.S. household-years is calculated by comparing to the average U.S. household electricity consumption of 10.6 MWh/year, as referenced in the context. 103.5 MWh / 5 MWh/year = 20.7 years.'},
           expected=id                                                                   q272
                    question                How many U.S. household-years of electricity c...
                    answer                                                                1.3
                    answer_value                                                          1.3
                    answer_unit                                               household-years
                    ref_id                                       ['dodge2022','strubell2019']
                    ref_url                 ['https://arxiv.org/pdf/2206.05229','https://a...
                    supporting_materials    Dodge 2022: "The 6.1B parameter model consumed...
                    explanation                 13.8 MWh ÷ 10.7 MWh/yr ≈ 1.3 household-years.
                    Name: 32, dtype: object,
           score=0.175,
           answer_score=0.0,
           ref_score=0.5,
           na_score=1.0,
           context='Doc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText: Published as a conference paper at ICLR 2025\nHOLISTICALLY EVALUATING THE ENVIRONMENTAL\nIMPACT OF CREATING LANGUAGE MODELS\nJacob Morrison1 Clara Na2 Jared Fernandez2\nTim Dettmers1,2 Emma Strubell1,2 Jesse Dodge1\n1Allen Institute for AI 2Carnegie Mellon University\njacobm@allenai.org\nABSTRACT\nAs the performance of artificial intelligence systems has dramatically increased,\nso too has the environmental impact of creating these systems. While many model\ndevelopers release estimates of the power consumption and carbon emissions from\nthe final training runs for their latest models, there is comparatively little trans-\nparency into the impact of model development, hardware manufacturing, and total\nwater usage throughout. In this work, we estimate the real-world environmental\nimpact of developing a series of language models, ranging from 20 million to 13\nbillion active parameters, trained on up to 5.6 trillion tokens each. When account-\ning for hardware manufacturing, model development, and our final training runs,\nwe find that our series of models released 493 metric tons of carbon emissions,\nequivalent to powering about 98 homes in the United States for one year, and\nconsumed 2.769 million liters of water , equivalent to about 24.5 years of water\nusage by a person in the United States, even though our data center is extremely\nwater-efficient. 
We measure and report the environmental impact of our model\ndevelopment; to the best of our knowledge we are the first to do so for LLMs, and\n\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText: com/Page.aspx?id=1012\n13https://anysilicon.com/die-per-wafer-formula-free-calculators/\n14https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/\n6 Published as a conference paper at ICLR 2025\nTable 2: We list the estimated power usage, carbon emissions, and water consumption from training our\ndense transformers, ranging from 20 million to 13 billion parameters, trained on 1.7 to 5.6 trillion tokens, and\na mixture-of-experts model with 1 billion active and 7 billion total parameters, trained to 5 trillion tokens. We\nfind that the environmental impact is quite high, even for our relatively small models. Training our series of\nmodels emitted equivalent carbon to over 65 years of electricity use by the average household in the U.S., and\nconsumed equivalent water to the average person in the U.S. for about 17 years.\n* One of the original OLMo 7B models was trained on LUMI, which runs entirely on hydroelectric power. See\nGroeneveld et al. (2024) for more information.\n† denotes unreleased models that were trained for various internal experiments.\nPower\nUsage\n(MWh)\nCarbon\nEmissions\n(tCO2eq)\nEquiv. to...\n(energy usage,\n1 home, U.S.)\nWater\nConsumption\n(kL)\nEquiv. 
to...\n(water usage,\n1 person, U.S.)\nGemma 2B & 9B - 131 25 yrs, 11 mo - -\nLlama 2 7B 81 31 6 yrs, 1 mo - -\nLlama 2 13B 162 62 12 yrs, 2 mo - -\nLlama 3.1 8B - 420 83 years - -\nLlama 3.2 1B - 107 14 years - -\nOLMo 20M† 0.8 0.3 3 weeks 1 3 days\nOLMo 60M† 1.2 0.4 1 month 1.6 5 days\nOLMo 150M† 2.4 1 2 mo, 1 wk 3.6 12\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText: pact of our model\ndevelopment; to the best of our knowledge we are the first to do so for LLMs, and\nwe find that model development, the impact of which is generally not disclosed\nby most model developers, amounted to ∼50% of that of training. By looking at\ndetailed time series data for power consumption, we also find that power usage\nthroughout training is not consistent, fluctuating between ∼15% and ∼85% of\nour hardware’s maximum power draw, with negative implications for grid-scale\nplanning as demand continues to grow. We close with a discussion on the con-\ntinued difficulty of estimating the environmental impact of AI systems, and key\ntakeaways for model developers and the public at large.\n1 I NTRODUCTION\nIn recent years, the field of artificial intelligence has progressed at an unprecedented pace, driven\nin large part by the development and deployment of large language and multimodal models. How-\never, the development of these models comes with significant environmental costs (Schwartz et al.,\n2020; Strubell et al., 2020; Wu et al., 2022). Training these models requires massive computational\nresources, which, in turn, require large amounts of energy. Powering training both emits carbon\n(by burning fossil fuels) and consumes water (by evaporating or polluting it in power plants, data\ncenters, and hardware manufacturing processes; Li et al. (2023)). 
There is a growing demand for\nenergy to power AI workloads, with projections estimating that datacenters may consume upwards\no\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText: where we\nrank each model by both its total water consumption\nand its CO2 emissions. Our small models (<1B param-\neters) were trained on 1.7 trillion tokens, OLMo 1B was\ntrained on 3 trillion, OLMo 2 7B was trained on 4 tril-\nlion, OLMoE was trained on 5 trillion, and OLMo 2\n13B was trained on 5.6 trillion. We see that the total\nenvironmental impact for larger training runs is quite\nhigh, and increases quickly with model and dataset size.\nIn this paper, we estimate the energy use and\nenvironmental impacts caused by training the\nOLMo series of transformer language models\n(Groeneveld et al., 2024; OLMo et al., 2025),\nranging in size from 20 million to 13 billion\nactive parameters, trained on 1.7 to 5.6 trillion\ntokens. To do this, we calculate Scope 2 CO 2\nemissions in accordance with the Greenhouse\nGas Protocol’s definitions,3 and Scope 1 and 2\nwater consumption following Li et al. (2023);\nin addition, we calculate “upstream” embod-\nied carbon and water consumption, and provide\n“downstream” estimates from use of our mod-\nels (which are part, but not all, of Scope 3).\nImportantly, we calculate (i) electricity con-\nsumption, (ii) carbon emissions, and (iii) wa-\nter consumption at three points in the machine\nlearning pipeline: early model development\n(e.g., hyperparameter tuning and experiments\nbefore the final training run), training of the\nmain model, and inference. 
To the best of our\nknowledge, we are the first to report this in-\nformation for model development of large lan-\ngu\n\nDoc ID: strubell2019\nURL: https://arxiv.org/pdf/1906.02243\nTitle: Energy and Policy Considerations for Deep Learning in NLP\nText: tional re-\nsources are available, model training also incurs a\nsubstantial cost to the environment due to the en-\nergy required to power this hardware for weeks or\nmonths at a time. Though some of this energy may\ncome from renewable or carbon credit-offset re-\nsources, the high energy demands of these models\nare still a concern since (1) energy is not currently\nderived from carbon-neural sources in many loca-\ntions, and (2) when renewable energy is available,\nit is still limited to the equipment we have to pro-\nduce and store it, and energy spent training a neu-\nral network might better be allocated to heating a\nfamily’s home. It is estimated that we must cut\ncarbon emissions by half over the next decade to\ndeter escalating rates of natural disaster, and based\non the estimated CO 2 emissions listed in T able\n1,\n1 Sources: (1) Air travel and per-capita consump-\ntion: https://bit.ly/2Hw0xWc; (2) car lifetime:\nhttps://bit.ly/2Qbr0w1. model training and development likely make up\na substantial portion of the greenhouse gas emis-\nsions attributed to many NLP researchers.\nT o heighten the awareness of the NLP commu-\nnity to this issue and promote mindful practice and\npolicy , we characterize the dollar cost and carbon\nemissions that result from training the neural net-\nworks at the core of many state-of-the-art NLP\nmodels. 
W e do this by estimating the kilowatts\nof energy required to train a variety of popular\noff-the-shelf NLP models, which can be converted\nto approximate carbon e\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText: the energy consumption of training a large language model comprising\nover 6.1 billion parameters during 8 days on 256 NVIDIA A100s. The total energy amounted to a staggering 13.8 MWh.\nThis model was not trained to completion, but only until 13%; a full training run would take 60 days. Thus, we estimate\nthe total energy consumption to train this model to completion would be approximately (60/8)∗13.8 = 103.5 MWh, or\n103,500 kWh — almost 2800 times more than training the BERT-small model!\n4.2 Computer Vision\nDenseNets. We trained three sizes of DenseNets [19] on MNIST [25]. The jobs lasted between 20 and 25 minutes and\nconsumed between 20 and 38Wh (or 0.02 to 0.04 kWh) of electricity, which is negligible compared to the other models.\nVision Transformers. We evaluated the energy consumption during the training of five sizes of Vision Transformers [9]\non ImageNet [7]. For the smallest ViT experiment (ViT tiny), training lasted around 19 hours on a single V100 and\nconsumed approximately 1.7 kWh. For the largest one (ViT huge), training lasted more than 9 days on a 4 V100s and\nconsumed approximately 237 kWh. The full list of models can be found in Table 2.\n5 EMISSIONS BY REGION AND TIME OF DAY\nUsing the methodology presented above, we provide some of the first measurements of the differences of actual\ndatacenters from a major cloud provider. 
Importantly, what we have is a time series of marginal emissions: for example,\nif a job were to run from 1 pm to 5 pm in the US West region wit\n\nDoc ID: morrison2025\nURL: https://arxiv.org/pdf/2503.05804\nTitle: Holistically Evaluating the Environmental Impact of Creating Language Models\nText:  our\nknowledge, we are the first to report this in-\nformation for model development of large lan-\nguage models, and we find the environmental\nimpact of developing even our relatively small\nmodels (only up to 13B parameters) is equivalent to burning 2.1 gasoline tanker trucks of fuel, or\nthe amount of water consumed by one average person in the United States in about 7.5 years. We\nencourage the reader to consider larger models released by other organizations to have equivalently\nlarger environmental impacts.\nOur methodology draws upon best practices from recent publications, aiming to provide the most\nthorough reporting yet of the environmental impact of LLMs. For example, unlike previous works\nthat assume GPUs operate at 100% of their theoretical maximum power draw (Dubey et al., 2024)\nand report only the cost to train a small set of released models, we measure power consumption\nat sub-second intervals throughout training. We focus our efforts on a wide range of model sizes,\noptimized for widespread deployment (Dubey et al., 2024; Mehta et al., 2024; Gemma Team et al.,\n2024), and estimate what the environmental impact would be if our models were deployed in a va-\nriety of different scenarios. We find that in some scenarios, our models would need to run inference\non a few billion instances to match the electricity consumed, carbon emitted, and water consumed\nof the entire training process, a figure that can be reached by production systems in weeks to months\nbased on current u\n\nDoc ID: jegham2025\nURL: https://arxiv.org/pdf/2505.09598\nTitle: How Hungry is AI? 
Benchmarking Energy, Water and Carbon Footprint of LLM Inference\nText:  Li, Adam Michaleas, Michael Jones,\nWilliam Bergeron, Jeremy Kepner, Devesh Tiwari, and Vijay Gadepally. From words to\nwatts: Benchmarking the energy costs of large language model inference. In 2023 IEEE High\nPerformance Extreme Computing Conference (HPEC), pages 1–9. IEEE, 2023.\n[21] The Green Grid. PUE™: A Comprehensive Examination of the Metric. February 2012. White\nPaper 49.\n[22] International Organization for Standardization (ISO) and International Electrotechnical Com-\nmission (IEC). Information technology – Data centres – Key performance indicators – Part\n2: Power usage effectiveness (PUE), April 2016. URL https://www.iso.org/standard/\n63211.html.\n[23] U.S. Environmental Protection Agency (EPA). Emissions & Generation Resource Integrated\nDatabase (eGRID). https://www.epa.gov/egrid, 2025.\n[24] International Energy Agency (IEA). Emissions Factors. 2025.\n[25] Emma Strubell, Ananya Ganesh, and Andrew McCallum. Energy and policy considerations for\nmodern deep learning research. In Proceedings of the AAAI conference on artificial intelligence,\nvolume 34, pages 13693–13696, 2020.\n[26] Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timo-\nthée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, et al. Llama: Open\nand efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023.\n[27] Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei,\nNikolay Bashlykov, Soumya Batra, Prajjwal B\n\nDoc ID: dodge2022\nURL: https://arxiv.org/pdf/2206.05229\nTitle: Measuring the Carbon Intensity of AI in Cloud Instances\nText: 2, June 21–24, 2022, Seoul, Republic of Korea\nModel BERT BERT 6B Dense Dense Dense ViT ViT ViT ViT ViT\nfinetune pretrain Transf. 
121 169 201 Tiny Small Base Large Huge\nGPU 4·V100 8·V100 256·A100 1·P40 1·P40 1·P40 1·V100 1·V100 1·V100 4·V100 4·V100\nHours 6 36 192 0.3 0.3 0.4 19 19 21 90 216\nkWh 3.1 37.3 13,812.4 0.02 0.03 0.04 1.7 2.2 4.7 93.3 237.6\nTable 2. For the 11 models in our analysis: the type of GPU, the number of GPUs of that type, the number of hours, and the energy\nused in kWh. For example, our BERT language modeling (BERT LM) experiment used 8 V100 GPUs for 36 hours and used a total of\n37.3 kWh. We note our training run of the 6 billion parameter transformer only trained for approximately 13% of the time it would\ntake to train to completion, we estimate a full training run would consume approximately 103,593 kWh.\n4.1 NLP\nBERT Training. We monitored the energy consumption while training a BERT-small model [8] for approximately 36\nhours on 8 NVIDIA V100 GPUs. That training run consumed over 37 kWh of electricity.\nBERT Finetuning. We tracked the energy consumption while finetuning the BERT-small model on a standard natural\nlanguage inference task [48, MNLI] for approximately 6 hours on 4 NVIDIA V100 GPUs. Our finetuning run consumed\naround 3.2 kWh of electricity, i.e., less than one tenth that due to BERT-small pre-training.\n6 Billion Parameter Transformer. We tracked the energy consumption of training a large language model comprising\nover 6.1 billion parameters dur\n\nDoc ID: zschache2025\nURL: https://arxiv.org/pdf/2508.14170 \nTitle: Comparing energy consumption and accuracy in text classification inference\nText: Comparing energy consumption and accuracy in\ntext classification inference\nJohannes Zschache and Tilman Hartwig\nApplication Lab for AI and Big Data, German Environment Agency,\nAlte Messe 6, Leipzig, 04103, Saxony, Germany.\n*Corresponding author(s). 
E-mail(s): tilman.hartwig@uba.de;\nContributing authors: johannes.zschache@uba.de;\nAbstract\nThe increasing deployment of large language models (LLMs) in natural language\nprocessing (NLP) tasks raises concerns about energy efficiency and sustainabil-\nity. While prior research has largely focused on energy consumption during\nmodel training, the inference phase has received comparatively less attention.\nThis study systematically evaluates the trade-offs between model accuracy and\nenergy consumption in text classification inference across various model archi-\ntectures and hardware configurations. Our empirical analysis shows that the\nbest-performing model in terms of accuracy can also be energy-efficient, while\nlarger LLMs tend to consume significantly more energy with lower classifica-\ntion accuracy. We observe substantial variability in inference energy consumption\n(<mWh to >kWh), influenced by model type, model size, and hardware spec-\nifications. Additionally, we find a strong correlation between inference energy\nconsumption and model runtime, indicating that execution time can serve as\na practical proxy for energy usage in settings where direct measurement is not\nfeasible. These findings have implications for sustainable AI develop')]
========== MIDDLE OF OUTPUT TRUNCATED ==========

What do you think where the RAG is going wrong?

🤖Reply🤖

🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠

Looking at your worst predictions, I can see a few key patterns where the RAG is failing:

1. Missing the correct source document entirely - For q280 about JetMoE-8B, none of the retrieved chunks are from shen2024 (the expected reference) - For q170 and q262, the chunks are from related papers (dodge2022, luccioni2023) but not from strubell2019 which has the specific answer

2. Right document, wrong chunk - Even when the document is retrieved (like q078), the specific numerical detail might not be in the chunk, or it’s buried in a way that’s hard for the LLM to extract

3. Semantic vs. keyword mismatch - Questions asking for very specific numbers/calculations might not embed similarly to the text containing those numbers

4. Answer format issues - For q078, the model found the right info but reported it inverted (10-50 responses per bottle instead of 0.02-0.1 bottles per response)

Would you like to dig deeper into any of these patterns, or try some experiments to improve retrieval for these specific failure cases?

Sure

🤖Reply🤖

Great! Let’s start by investigating the retrieval issue more closely.

For the failed questions, would you like to:

  1. Search for specific text from the expected answers in your document chunks to see if they’re actually there
  2. Compare embeddings between the query and the correct chunk to understand why it ranked lower
  3. Try different retrieval parameters (like increasing the limit or adjusting hybrid search weights)

Which approach would you like to explore first?

Will changing the embedding model improve the score?

🤖Reply🤖

It could, but it depends on what’s causing the failures. Let me break this down:

When a different embedding model might help: - If your current model (nomic-embed-text) isn’t capturing domain-specific terminology well - If it’s not good at distinguishing between similar technical concepts - If there’s a mismatch between how queries and documents are embedded

When it might NOT help much: - If the problem is that the correct chunks simply don’t contain the specific facts (chunking issue) - If the information is there but the LLM isn’t extracting it correctly (generation issue) - If your hybrid search weights need tuning

To test this hypothesis, you could: 1. Check if the correct chunks are even in your top-10 or top-20 results 2. Try a domain-specific embedding model (like one trained on scientific papers) 3. Compare results with your current model

Do you want to first check whether the right chunks are being retrieved at all (just ranked lower), or would you prefer to try a different embedding model right away?

I think the worst predictions are already there, what do you think?

🤖Reply🤖

You’re absolutely right! You already have the worst predictions with evals.get_worst_predictions().

Looking at those results, the key diagnostic question is: Are the correct chunks being retrieved at all, just ranked poorly?

You could check this by: - Looking at whether the expected ref_id appears anywhere in the top 10-20 retrieved chunks - Searching for the specific answer text in your chunks to see if it exists

For example, for q280 about JetMoE-8B, you could search all chunks for “JetMoE” or “30,000” to see if that information is even in your corpus.

Would you like to write a quick check to see if the expected document IDs appear in the retrieved chunks for your worst predictions?

I want to improve the figure and table interpretation from the PDF

🤖Reply🤖

Good idea! Improving figure and table interpretation could help with those questions that reference specific data points.

Here are a few approaches you could take:

  1. Extract tables separately - Use a specialized tool to parse tables from PDFs and store them with better structure
  2. Add visual context - Include table/figure captions and surrounding text when chunking
  3. Use multimodal embeddings - Embed images of tables/figures along with text
  4. Structured extraction - Convert tables to structured formats (CSV, JSON) before embedding

Which approach interests you most, or would you like to explore what’s currently happening with tables in your chunks?

Also, the ref_id has square brackets even though the answer is is_blank — for example, [“is_blank”]

🤖Reply🤖

Good catch! That’s a data consistency issue. The ref_id should be "is_blank" (a string) when the answer is blank, not ["is_blank"] (a list containing the string).

This could be happening in: 1. Your prompt template - if it’s instructing the model to always return a list 2. Your data preprocessing - if you’re wrapping everything in lists 3. The LLM’s output - if it’s being too literal about the JSON schema

Would you like to check your prompt template to see how ref_id is defined, or would you prefer to add post-processing to normalize this?

Let's address the table and image extraction. I think we are missing a great deal of information there. I want to generate the embedding and the text from the table/image, and also have the image/table as part of the payload if possible, for further debugging

🤖Reply🤖

Great plan! Extracting tables and images separately will likely improve your results significantly.

Here’s the approach:

  1. Extract tables/images from PDFs - Use a library like pdfplumber for tables or a multimodal model for images
  2. Convert to text - Tables → markdown/CSV format, Images → captions/descriptions
  3. Store in chunks - Include the extracted text plus metadata (page number, type: “table”/“image”)
  4. Optionally store raw data - Keep the image bytes or table structure in payload for debugging

For your use case, would you like to: - Start with just tables (simpler, likely covers most of your numerical data)? - Or handle both tables and images from the beginning?

Let's start with tables

🤖Reply🤖

Perfect! For table extraction from PDFs, you have a few good options:

  1. pdfplumber - Good for structured tables, extracts to list of dicts
  2. camelot-py - Specialized for tables, handles complex layouts
  3. tabula-py - Java-based, exports to pandas DataFrames
  4. Multimodal LLM - Use vision models to “read” tables from PDF images

For your wattbot data with research papers, I’d recommend starting with pdfplumber since it’s already available and works well with academic PDFs.

Would you like to try extracting tables from one of your PDFs to see what we get, or would you prefer me to show you a small code example first?

I also want to see why my rd.read_pdf does not do a good job at extracting tables

🤖Reply🤖

Good idea to investigate! contextkit.read_pdf likely uses a basic text extraction method that doesn’t handle table structure well - it probably just extracts text in reading order, losing the column/row relationships.

To see what’s happening, you could:

  1. Pick a PDF with a known table (like one of your papers with data tables)
  2. Extract it with rd.read_pdf() and look at the output
  3. Compare with what a table-specific tool extracts

Want to try this with one of your documents to see the difference?

I have a doc id strubell2019

🤖Reply🤖

Perfect! Let’s look at what rd.read_pdf extracts from that document. Try this:

content = rd.read_pdf(data_path/'strubell2019.pdf')

Then search for a section that should contain a table (like “Table 1” or similar) and print a chunk around it to see how the table data looks.

Would you like to do this, or should I help you with the code?

content = rd.read_pdf(data_path/'strubell2019.pdf')
content
'arXiv:1906.02243v1  [cs.CL]  5 Jun 2019\nEnergy and Policy Considerations for Deep Learning in NLP\nEmma Strubell Ananya Ganesh Andrew McCallum\nCollege of Information and Computer Sciences\nUniversity of Massachusetts Amherst\n{strubell, aganesh, mccallum}@cs.umass.edu\nAbstract\nRecent progress in hardware and methodol-\nogy for training neural networks has ushered\nin a new generation of large networks trained\non abundant data. These models have ob-\ntained notable gains in accuracy across many\nNLP tasks. However, these accuracy improve-\nments depend on the availability of exception-\nally large computational resources that neces-\nsitate similarly substantial energy consump-\ntion. As a result these models are costly to\ntrain and develop, both financially, due to the\ncost of hardware and electricity or cloud com-\npute time, and environmentally, due to the car-\nbon footprint required to fuel modern tensor\nprocessing hardware. In this paper we bring\nthis issue to the attention of NLP researchers\nby quantifying the approximate financial and\nenvironmental costs of training a variety of re-\ncently successful neural network models for\nNLP . Based on these findings, we propose ac-\ntionable recommendations to reduce costs and\nimprove equity in NLP research and practice.\n1 Introduction\nAdvances in techniques and hardware for train-\ning deep neural networks have recently en-\nabled impressive accuracy improvements across\nmany fundamental NLP tasks (\nBahdanau et al. ,\n2015; Luong et al. , 2015; Dozat and Man-\nning, 2017; V aswani et al. , 2017), with the\nmost computationally-hungry models obtaining\nthe highest scores (\nPeters et al. , 2018; Devlin et al. ,\n2019; Radford et al. , 2019; So et al. , 2019). As\na result, training a state-of-the-art model now re-\nquires substantial computational resources which\ndemand considerable energy , along with the as-\nsociated financial and environmental costs. 
Re-\nsearch and development of new models multiplies\nthese costs by thousands of times by requiring re-\ntraining to experiment with model architectures\nand hyperparameters. Whereas a decade ago most\nConsumption CO 2e (lbs)\nAir travel, 1 passenger, NY ↔ SF 1984\nHuman life, avg, 1 year 11,023\nAmerican life, avg, 1 year 36,156\nCar, avg incl. fuel, 1 lifetime 126,000\nT raining one model (GPU)\nNLP pipeline (parsing, SRL) 39\nw/ tuning & experimentation 78,468\nTransformer (big) 192\nw/ neural architecture search 626,155\nT able 1: Estimated CO 2 emissions from training com-\nmon NLP models, compared to familiar consumption. 1\nNLP models could be trained and developed on\na commodity laptop or server, many now require\nmultiple instances of specialized hardware such as\nGPUs or TPUs, therefore limiting access to these\nhighly accurate models on the basis of finances.\nEven when these expensive computational re-\nsources are available, model training also incurs a\nsubstantial cost to the environment due to the en-\nergy required to power this hardware for weeks or\nmonths at a time. Though some of this energy may\ncome from renewable or carbon credit-offset re-\nsources, the high energy demands of these models\nare still a concern since (1) energy is not currently\nderived from carbon-neural sources in many loca-\ntions, and (2) when renewable energy is available,\nit is still limited to the equipment we have to pro-\nduce and store it, and energy spent training a neu-\nral network might better be allocated to heating a\nfamily’s home. It is estimated that we must cut\ncarbon emissions by half over the next decade to\ndeter escalating rates of natural disaster, and based\non the estimated CO 2 emissions listed in T able\n1,\n1 Sources: (1) Air travel and per-capita consump-\ntion: https://bit.ly/2Hw0xWc; (2) car lifetime:\nhttps://bit.ly/2Qbr0w1. 
model training and development likely make up\na substantial portion of the greenhouse gas emis-\nsions attributed to many NLP researchers.\nT o heighten the awareness of the NLP commu-\nnity to this issue and promote mindful practice and\npolicy , we characterize the dollar cost and carbon\nemissions that result from training the neural net-\nworks at the core of many state-of-the-art NLP\nmodels. W e do this by estimating the kilowatts\nof energy required to train a variety of popular\noff-the-shelf NLP models, which can be converted\nto approximate carbon emissions and electricity\ncosts. T o estimate the even greater resources re-\nquired to transfer an existing model to a new task\nor develop new models, we perform a case study\nof the full computational resources required for the\ndevelopment and tuning of a recent state-of-the-art\nNLP pipeline (\nStrubell et al. , 2018). W e conclude\nwith recommendations to the community based on\nour findings, namely: (1) Time to retrain and sen-\nsitivity to hyperparameters should be reported for\nNLP machine learning models; (2) academic re-\nsearchers need equitable access to computational\nresources; and (3) researchers should prioritize de-\nveloping efficient models and hardware.\n2 Methods\nT o quantify the computational and environmen-\ntal cost of training deep neural network mod-\nels for NLP , we perform an analysis of the en-\nergy required to train a variety of popular off-\nthe-shelf NLP models, as well as a case study of\nthe complete sum of resources required to develop\nLISA (\nStrubell et al. , 2018), a state-of-the-art NLP\nmodel from EMNLP 2018, including all tuning\nand experimentation.\nW e measure energy use as follows. W e train the\nmodels described in §\n2.1 using the default settings\nprovided, and sample GPU and CPU power con-\nsumption during training. Each model was trained\nfor a maximum of 1 day . 
W e train all models on\na single NVIDIA Titan X GPU, with the excep-\ntion of ELMo which was trained on 3 NVIDIA\nGTX 1080 Ti GPUs. While training, we repeat-\nedly query the NVIDIA System Management In-\nterface\n2 to sample the GPU power consumption\nand report the average over all samples. T o sample\nCPU power consumption, we use Intel’s Running\nA verage Power Limit interface.\n3\n2 nvidia-smi: https://bit.ly/30sGEbi\n3 RAPL power meter: https://bit.ly/2LObQhV\nConsumer Renew . Gas Coal Nuc.\nChina 22% 3% 65% 4%\nGermany 40% 7% 38% 13%\nUnited States 17% 35% 27% 19%\nAmazon-A WS 17% 24% 30% 26%\nGoogle 56% 14% 15% 10%\nMicrosoft 32% 23% 31% 10%\nT able 2: Percent energy sourced from: Renewable (e.g.\nhydro, solar, wind), natural gas, coal and nuclear for\nthe top 3 cloud compute providers (\nCook et al. , 2017),\ncompared to the United States, 4 China5 and Germany\n(Burger, 2019).\nW e estimate the total time expected for mod-\nels to train to completion using training times and\nhardware reported in the original papers. W e then\ncalculate the power consumption in kilowatt-hours\n(kWh) as follows. Let pc be the average power\ndraw (in watts) from all CPU sockets during train-\ning, let pr be the average power draw from all\nDRAM (main memory) sockets, let pg be the aver-\nage power draw of a GPU during training, and let\ng be the number of GPUs used to train. W e esti-\nmate total power consumption as combined GPU,\nCPU and DRAM consumption, then multiply this\nby Power Usage Effectiveness (PUE), which ac-\ncounts for the additional energy required to sup-\nport the compute infrastructure (mainly cooling).\nW e use a PUE coefficient of 1.58, the 2018 global\naverage for data centers (\nAscierto, 2018). Then the\ntotal power pt required at a given instance during\ntraining is given by:\npt = 1.58t(pc + pr + gpg)\n1000 (1)\nThe U.S. 
Environmental Protection Agency (EP A)\nprovides average CO 2 produced (in pounds per\nkilowatt-hour) for power consumed in the U.S.\n(\nEP A, 2018), which we use to convert power to\nestimated CO 2 emissions:\nCO2e = 0.954pt (2)\nThis conversion takes into account the relative pro-\nportions of different energy sources (primarily nat-\nural gas, coal, nuclear and renewable) consumed\nto produce energy in the United States. T able\n2\nlists the relative energy sources for China, Ger-\nmany and the United States compared to the top\n5 U.S. Dept. of Energy: https://bit.ly/2JTbGnI\n5 China Electricity Council; trans. China Energy Portal:\nhttps://bit.ly/2QHE5O3 three cloud service providers. The U.S. break-\ndown of energy is comparable to that of the most\npopular cloud compute service, Amazon W eb Ser-\nvices, so we believe this conversion to provide a\nreasonable estimate of CO 2 emissions per kilowatt\nhour of compute energy used.\n2.1 Models\nW e analyze four models, the computational re-\nquirements of which we describe below . All mod-\nels have code freely available online, which we\nused out-of-the-box. For more details on the mod-\nels themselves, please refer to the original papers.\nT ransformer. The Transformer model (\nV aswani\net al. , 2017) is an encoder-decoder architecture\nprimarily recognized for efficient and accurate ma-\nchine translation. The encoder and decoder each\nconsist of 6 stacked layers of multi-head self-\nattention.\nV aswani et al. (2017) report that the\nTransformer base model (65M parameters) was\ntrained on 8 NVIDIA P100 GPUs for 12 hours,\nand the Transformer big model (213M parame-\nters) was trained for 3.5 days (84 hours; 300k\nsteps). This model is also the basis for recent\nwork on neural architecture search ( NAS) for ma-\nchine translation and language modeling (\nSo et al. ,\n2019), and the NLP pipeline that we study in more\ndetail in §4.2 (Strubell et al. , 2018). 
So et al.\n(2019) report that their full architecture search ran\nfor a total of 979M training steps, and that their\nbase model requires 10 hours to train for 300k\nsteps on one TPUv2 core. This equates to 32,623\nhours of TPU or 274,120 hours on 8 P100 GPUs.\nELMo. The ELMo model (\nPeters et al. , 2018)\nis based on stacked LSTMs and provides rich\nword representations in context by pre-training on\na large amount of data using a language model-\ning objective. Replacing context-independent pre-\ntrained word embeddings with ELMo has been\nshown to increase performance on downstream\ntasks such as named entity recognition, semantic\nrole labeling, and coreference.\nPeters et al. (2018)\nreport that ELMo was trained on 3 NVIDIA GTX\n1080 GPUs for 2 weeks (336 hours).\nBERT .The BERT model (\nDevlin et al. , 2019) pro-\nvides a Transformer-based architecture for build-\ning contextual representations similar to ELMo,\nbut trained with a different language modeling ob-\njective. BERT substantially improves accuracy on\ntasks requiring sentence-level representations such\nas question answering and natural language infer-\nence.\nDevlin et al. (2019) report that the BERT\nbase model (110M parameters) was trained on 16\nTPU chips for 4 days (96 hours). NVIDIA reports\nthat they can train a BERT model in 3.3 days (79.2\nhours) using 4 DGX-2H servers, totaling 64 T esla\nV100 GPUs (\nForster et al. , 2019).\nGPT -2. This model is the latest edition of\nOpenAI’s GPT general-purpose token encoder,\nalso based on Transformer-style self-attention and\ntrained with a language modeling objective (\nRad-\nford et al. , 2019). By training a very large model\non massive data, Radford et al. (2019) show high\nzero-shot performance on question answering and\nlanguage modeling benchmarks. The large model\ndescribed in\nRadford et al. 
(2019) has 1542M pa-\nrameters and is reported to require 1 week (168\nhours) of training on 32 TPUv3 chips.\n6\n3 Related work\nThere is some precedent for work characterizing\nthe computational requirements of training and in-\nference in modern neural network architectures in\nthe computer vision community .\nLi et al. (2016)\npresent a detailed study of the energy use required\nfor training and inference in popular convolutional\nmodels for image classification in computer vi-\nsion, including fine-grained analysis comparing\ndifferent neural network layer types.\nCanziani\net al. (2016) assess image classification model ac-\ncuracy as a function of model size and gigaflops\nrequired during inference. They also measure av-\nerage power draw required during inference on\nGPUs as a function of batch size. Neither work an-\nalyzes the recurrent and self-attention models that\nhave become commonplace in NLP , nor do they\nextrapolate power to estimates of carbon and dol-\nlar cost of training.\nAnalysis of hyperparameter tuning has been\nperformed in the context of improved algorithms\nfor hyperparameter search (\nBergstra et al. , 2011;\nBergstra and Bengio , 2012; Snoek et al. , 2012). T o\nour knowledge there exists to date no analysis of\nthe computation required for R&D and hyperpa-\nrameter tuning of neural network models in NLP .\n6 Via the authors on Reddit .\n7 GPU lower bound computed using pre-emptible\nP100/V100 U.S. resources priced at $0.43–$0.74/hr, upper\nbound uses on-demand U.S. resources priced at $1.46–\n$2.48/hr. W e similarly use pre-emptible ($1.46/hr–$2.40/ hr)\nand on-demand ($4.50/hr–$8/hr) pricing as lower and upper\nbounds for TPU v2/3; cheaper bulk contracts are available. 
Model Hardware Power (W) Hours kWh ·PUE CO 2e Cloud compute cost\nTransformerbase P100x8 1415.78 12 27 26 $41–$140\nTransformerbig P100x8 1515.43 84 201 192 $289–$981\nELMo P100x3 517.66 336 275 262 $433–$1472\nBERTbase V100x64 12,041.51 79 1507 1438 $3751–$12,571\nBERTbase TPUv2x16 — 96 — — $2074–$6912\nNAS P100x8 1515.43 274,120 656,347 626,155 $942,973–$3,20 1,722\nNAS TPUv2x1 — 32,623 — — $44,055–$146,848\nGPT -2 TPUv3x32 — 168 — — $12,902–$43,008\nT able 3: Estimated cost of training a model in terms of CO 2 emissions (lbs) and cloud compute cost (USD).\n7 Power\nand carbon footprint are omitted for TPUs due to lack of publi c information on power draw for this hardware.\n4 Experimental results\n4.1 Cost of training\nT able\n3 lists CO 2 emissions and estimated cost of\ntraining the models described in §2.1. Of note is\nthat TPUs are more cost-efficient than GPUs on\nworkloads that make sense for that hardware (e.g.\nBERT). W e also see that models emit substan-\ntial carbon emissions; training BERT on GPU is\nroughly equivalent to a trans-American flight.\nSo\net al. (2019) report that NAS achieves a new state-\nof-the-art BLEU score of 29.7 for English to Ger-\nman machine translation, an increase of just 0.1\nBLEU at the cost of at least $150k in on-demand\ncompute time and non-trivial carbon emissions.\n4.2 Cost of development: Case study\nT o quantify the computational requirements of\nR&D for a new model we study the logs of\nall training required to develop Linguistically-\nInformed Self-Attention (\nStrubell et al. , 2018), a\nmulti-task model that performs part-of-speech tag-\nging, labeled dependency parsing, predicate detec-\ntion and semantic role labeling. This model makes\nfor an interesting case study as a representative\nNLP pipeline and as a Best Long Paper at EMNLP .\nModel training associated with the project\nspanned a period of 172 days (approx. 
6 months).\nDuring that time 123 small hyperparameter grid\nsearches were performed, resulting in 4789 jobs\nin total. Jobs varied in length ranging from a min-\nimum of 3 minutes, indicating a crash, to a maxi-\nmum of 9 days, with an average job length of 52\nhours. All training was done on a combination of\nNVIDIA Titan X (72%) and M40 (28%) GPUs.\n8\nThe sum GPU time required for the project\ntotaled 9998 days (27 years). This averages to\n8 W e approximate cloud compute cost using P100 pricing.\nEstimated cost (USD)\nModels Hours Cloud compute Electricity\n1 120 $52–$175 $5\n24 2880 $1238–$4205 $118\n4789 239,942 $103k–$350k $9870\nT able 4: Estimated cost in terms of cloud compute and\nelectricity for training: (1) a single model (2) a single\ntune and (3) all models trained during R&D.\nabout 60 GPUs running constantly throughout the\n6 month duration of the project. T able\n4 lists upper\nand lower bounds of the estimated cost in terms\nof Google Cloud compute and raw electricity re-\nquired to develop and deploy this model.\n9 W e see\nthat while training a single model is relatively in-\nexpensive, the cost of tuning a model for a new\ndataset, which we estimate here to require 24 jobs,\nor performing the full R&D required to develop\nthis model, quickly becomes extremely expensive.\n5 Conclusions\nAuthors should report training time and\nsensitivity to hyperparameters.\nOur experiments suggest that it would be benefi-\ncial to directly compare different models to per-\nform a cost-benefit (accuracy) analysis. T o ad-\ndress this, when proposing a model that is meant\nto be re-trained for downstream use, such as re-\ntraining on a new domain or fine-tuning on a new\ntask, authors should report training time and com-\nputational resources required, as well as model\nsensitivity to hyperparameters. 
This will enable\ndirect comparison across models, allowing subse-\nquent consumers of these models to accurately as-\nsess whether the required computational resources\n9 Based on average U.S cost of electricity of $0.12/kWh. are compatible with their setting. More explicit\ncharacterization of tuning time could also reveal\ninconsistencies in time spent tuning baseline mod-\nels compared to proposed contributions. Realiz-\ning this will require: (1) a standard, hardware-\nindependent measurement of training time, such\nas gigaflops required to convergence, and (2) a\nstandard measurement of model sensitivity to data\nand hyperparameters, such as variance with re-\nspect to hyperparameters searched.\nAcademic researchers need equitable access to\ncomputation resources.\nRecent advances in available compute come at a\nhigh price not attainable to all who desire access.\nMost of the models studied in this paper were de-\nveloped outside academia; recent improvements in\nstate-of-the-art accuracy are possible thanks to in-\ndustry access to large-scale compute.\nLimiting this style of research to industry labs\nhurts the NLP research community in many ways.\nFirst, it stifles creativity . Researchers with good\nideas but without access to large-scale compute\nwill simply not be able to execute their ideas,\ninstead constrained to focus on different prob-\nlems. Second, it prohibits certain types of re-\nsearch on the basis of access to financial resources.\nThis even more deeply promotes the already prob-\nlematic “rich get richer” cycle of research fund-\ning, where groups that are already successful and\nthus well-funded tend to receive more funding\ndue to their existing accomplishments. 
Third, the\nprohibitive start-up cost of building in-house re-\nsources forces resource-poor groups to rely on\ncloud compute services such as A WS, Google\nCloud and Microsoft Azure.\nWhile these services provide valuable, flexi-\nble, and often relatively environmentally friendly\ncompute resources, it is more cost effective for\nacademic researchers, who often work for non-\nprofit educational institutions and whose research\nis funded by government entities, to pool resources\nto build shared compute centers at the level of\nfunding agencies, such as the U.S. National Sci-\nence Foundation. For example, an off-the-shelf\nGPU server containing 8 NVIDIA 1080 Ti GPUs\nand supporting hardware can be purchased for\napproximately $20,000 USD. At that cost, the\nhardware required to develop the model in our\ncase study (approximately 58 GPUs for 172 days)\nwould cost $145,000 USD plus electricity , about\nhalf the estimated cost to use on-demand cloud\nGPUs. Unlike money spent on cloud compute,\nhowever, that invested in centralized resources\nwould continue to pay off as resources are shared\nacross many projects. A government-funded aca-\ndemic compute cloud would provide equitable ac-\ncess to all researchers.\nResearchers should prioritize computationally\nefficient hardware and algorithms.\nW e recommend a concerted effort by industry and\nacademia to promote research of more computa-\ntionally efficient algorithms, as well as hardware\nthat requires less energy . An effort can also be\nmade in terms of software. There is already a\nprecedent for NLP software packages prioritizing\nefficient models. An additional avenue through\nwhich NLP and machine learning software de-\nvelopers could aid in reducing the energy asso-\nciated with model tuning is by providing easy-\nto-use APIs implementing more efficient alterna-\ntives to brute-force grid search for hyperparameter\ntuning, e.g. random or Bayesian hyperparameter\nsearch techniques (\nBergstra et al. 
, 2011; Bergstra\nand Bengio , 2012; Snoek et al. , 2012). While\nsoftware packages implementing these techniques\ndo exist,\n10 they are rarely employed in practice\nfor tuning NLP models. This is likely because\ntheir interoperability with popular deep learning\nframeworks such as PyT orch and T ensorFlow is\nnot optimized, i.e. there are not simple exam-\nples of how to tune T ensorFlow Estimators using\nBayesian search. Integrating these tools into the\nworkflows with which NLP researchers and practi-\ntioners are already familiar could have notable im-\npact on the cost of developing and tuning in NLP .\nAcknowledgements\nW e are grateful to Sherief Farouk and the anony-\nmous reviewers for helpful feedback on earlier\ndrafts. This work was supported in part by the\nCenters for Data Science and Intelligent Infor-\nmation Retrieval, the Chan Zuckerberg Initiative\nunder the Scientific Knowledge Base Construc-\ntion project, the IBM Cognitive Horizons Network\nagreement no. W1668553, and National Science\nFoundation grant no. IIS-1514053. Any opinions,\nfindings and conclusions or recommendations ex-\npressed in this material are those of the authors and\ndo not necessarily reflect those of the sponsor.\n10 For example, the Hyperopt Python library . References\nRhonda Ascierto. 2018.\nUptime Institute Global Data\nCenter Survey . T echnical report, Uptime Institute.\nDzmitry Bahdanau, Kyunghyun Cho, and Y oshua Ben-\ngio. 2015. Neural Machine Translation by Jointly\nLearning to Align and Translate. In 3rd Inter-\nnational Conference for Learning Representations\n(ICLR), San Diego, California, USA.\nJames Bergstra and Y oshua Bengio. 2012. Random\nsearch for hyper-parameter optimization. Journal of\nMachine Learning Research, 13(Feb):281–305.\nJames S Bergstra, R´ emi Bardenet, Y oshua Bengio, and\nBal´ azs K ´ egl. 2011. Algorithms for hyper-parameter\noptimization. In Advances in neural information\nprocessing systems, pages 2546–2554.\nBruno Burger. 
2019.\nNet Public Electricity Generation\nin Germany in 2018 . T echnical report, Fraunhofer\nInstitute for Solar Energy Systems ISE.\nAlfredo Canziani, Adam Paszke, and Eugenio Culur-\nciello. 2016. An analysis of deep neural network\nmodels for practical applications .\nGary Cook, Jude Lee, T amina Tsai, Ada Kongn, John\nDeans, Brian Johnson, Elizabeth Jardim, and Brian\nJohnson. 2017.\nClicking Clean: Who is winning\nthe race to build a green internet? T echnical report,\nGreenpeace.\nJacob Devlin, Ming-W ei Chang, Kenton Lee, and\nKristina T outanova. 2019. BER T: Pre-training of\nDeep Bidirectional Transformers for Language Un-\nderstanding. In NAACL.\nTimothy Dozat and Christopher D. Manning. 2017.\nDeep biaffine attention for neural dependency pars-\ning. In ICLR.\nEP A. 2018.\nEmissions & Generation Resource Inte-\ngrated Database (eGRID) . T echnical report, U.S.\nEnvironmental Protection Agency.\nChristopher Forster, Thor Johnsen, Swetha Man-\ndava, Sharath Turuvekere Sreenivas, Deyu Fu, Julie\nBernauer, Allison Gray, Sharan Chetlur, and Raul\nPuri. 2019.\nBER T Meets GPUs . T echnical report,\nNVIDIA AI.\nDa Li, Xinbo Chen, Michela Becchi, and Ziliang Zong.\n2016. Evaluating the energy efficiency of deep con-\nvolutional neural networks on cpus and gpus. 2016\nIEEE International Conferences on Big Data and\nCloud Computing (BDCloud), Social Computing\nand Networking (SocialCom), Sustainable Comput-\ning and Communications (SustainCom) (BDCloud-\nSocialCom-SustainCom), pages 477–484.\nThang Luong, Hieu Pham, and Christopher D. Man-\nning. 2015.\nEffective approaches to attention-based\nneural machine translation . In Proceedings of the\n2015 Conference on Empirical Methods in Natural\nLanguage Processing , pages 1412–1421. Associa-\ntion for Computational Linguistics.\nMatthew E. Peters, Mark Neumann, Mohit Iyyer, Matt\nGardner, Christopher Clark, Kenton Lee, and Luke\nZettlemoyer. 2018. Deep contextualized word rep-\nresentations. 
In NAACL.\nAlec Radford, Jeffrey Wu, Rewon Child, David Luan,\nDario Amodei, and Ilya Sutskever. 2019.\nLanguage\nmodels are unsupervised multitask learners .\nJasper Snoek, Hugo Larochelle, and Ryan P Adams.\n2012. Practical bayesian optimization of machine\nlearning algorithms. In Advances in neural informa-\ntion processing systems, pages 2951–2959.\nDavid R. So, Chen Liang, and Quoc V . Le. 2019.\nThe evolved transformer . In Proceedings of the\n36th International Conference on Machine Learning\n(ICML).\nEmma Strubell, Patrick V erga, Daniel Andor,\nDavid W eiss, and Andrew McCallum. 2018.\nLinguistically-Informed Self-Attention for Se-\nmantic Role Labeling. In Conference on Empir-\nical Methods in Natural Language Processing\n(EMNLP), Brussels, Belgium.\nAshish V aswani, Noam Shazeer, Niki Parmar, Jakob\nUszkoreit, Llion Jones, Aidan N Gomez, Lukasz\nKaiser, and Illia Polosukhin. 2017. Attention is all\nyou need. In 31st Conference on Neural Information\nProcessing Systems (NIPS).'

It is able to extract information from the table

🤖Reply🤖

You’re right! Looking at the output, rd.read_pdf does extract the table content from strubell2019. For example, I can see Table 1 with the CO2 emissions data and Table 2 with the energy sources.

However, the structure is lost - it’s just linear text. So questions like “How many days of CO₂ emissions from an average American life are equivalent to training BERT base?” require the model to parse:

Consumption CO 2e (lbs)
...
American life, avg, 1 year 36,156
...
BERT base ... 1438

And then calculate: 1438 / (36,156/365) = 14.4 days

The challenge is that this information is spread across different parts of the text, and the table structure that makes it easy for humans to read is gone.

Would you still like to try extracting tables separately to preserve their structure better, or do you think the current extraction is sufficient?

It will be better to try extracting the tables separately to preserve their structure.

🤖Reply🤖

Good choice! Extracting tables with their structure preserved will definitely help with those numerical questions.

For table extraction, you have a few options:

  1. pdfplumber - Extract tables as lists of dictionaries, good for structured tables
  2. camelot-py - Specialized for complex table layouts
  3. PyMuPDF (fitz) - Can extract tables and also get page images

Which one would you like to try first? I’d suggest starting with pdfplumber since it’s straightforward and handles most academic paper tables well.

Won't extracting the tables duplicate the table information that rd.read_pdf already extracts? How can we avoid that?

🤖Reply🤖

Great point! You don’t want to duplicate the same information. Here are a few approaches:

  1. Replace table text with structured version - When you find a table, remove that text from the main content and replace it with a markdown/structured version
  2. Store tables as separate chunks - Extract tables separately and mark them with type: "table" in metadata, then skip those page regions when doing regular text extraction
  3. Hybrid approach - Keep both but use metadata to deduplicate during retrieval (e.g., filter by chunk type)

The cleanest approach is probably #2 - extract tables separately with special metadata, then use a table-aware PDF reader that skips table regions.

Which approach sounds best for your use case?

Let's try pdfplumber and see if it is able to differentiate the tables; if so, we can use pdfplumber instead of rd.read_pdf.

🤖Reply🤖

Good plan! Let’s test pdfplumber on the strubell2019 PDF to see what it extracts.

First, you’ll need to install it if you don’t have it:

!pip install pdfplumber

Then try extracting from one page to see what tables it finds:

import pdfplumber

# Probe the first page of the Strubell et al. (2019) PDF for any tables
# that pdfplumber's default detection can find, and preview the first one.
with pdfplumber.open(data_path/'strubell2019.pdf') as pdf:
    tables = pdf.pages[0].extract_tables()
    print(f"Found {len(tables)} tables on page 1")
    if tables:
        print(tables[0])

This will show you if pdfplumber can identify and extract tables separately. Want to try this?

!pip install pdfplumber
Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/site-packages (from pdfplumber) (12.0.0)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/site-packages (from pdfminer.six==20251107->pdfplumber) (3.4.4)
Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/site-packages (from pdfminer.six==20251107->pdfplumber) (45.0.7)
Requirement already satisfied: cffi>=1.14 in /usr/local/lib/python3.12/site-packages (from cryptography>=36.0.0->pdfminer.six==20251107->pdfplumber) (2.0.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.12/site-packages (from cffi>=1.14->cryptography>=36.0.0->pdfminer.six==20251107->pdfplumber) (2.23)
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/5.6 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.6/5.6 MB 79.8 MB/s  0:00:00
Downloading pypdfium2-5.0.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/3.0 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 62.2 MB/s  0:00:00
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
  WARNING: The script pypdfium2 is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/3 [pdfminer.six]   ━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/3 [pdfminer.six]  WARNING: The script pdfplumber is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 [pdfplumber]
Successfully installed pdfminer.six-20251107 pdfplumber-0.11.8 pypdfium2-5.0.0
import pdfplumber

# Open the paper and check whether pdfplumber detects any tables on page 1;
# print a preview of the first detected table if so.
with pdfplumber.open(data_path/'strubell2019.pdf') as pdf:
    tables = pdf.pages[0].extract_tables()
    print(f"Found {len(tables)} tables on page 1")
    if tables:
        print(tables[0])
Found 0 tables on page 1

It could not find any table

🤖Reply🤖

The table might be on a different page. Try checking a few more pages to see where the tables are:

with pdfplumber.open(data_path/'strubell2019.pdf') as pdf:
    # Scan the first five pages and report any that contain detectable tables.
    for i, page in enumerate(pdf.pages[:5]):
        tables = page.extract_tables()
        if not tables:
            continue
        print(f"Page {i}: Found {len(tables)} table(s)")

This will show you which pages have tables that pdfplumber can detect. Want to try this?

with pdfplumber.open(data_path/'strubell2019.pdf') as pdf:
    # Walk the first five pages, reporting only those with detectable tables.
    for i, page in enumerate(pdf.pages[:5]):
        tables = page.extract_tables()
        if not tables:
            continue
        print(f"Page {i}: Found {len(tables)} table(s)")

It was on the first page

🤖Reply🤖

Interesting - so pdfplumber isn’t detecting the table on page 1 even though we can see it in the text. This could be because:

  1. The table isn’t structured with proper borders/lines in the PDF
  2. It’s formatted as plain text rather than a table object
  3. pdfplumber’s default settings are too strict

You could try adjusting pdfplumber’s table detection settings, or alternatively, try a different tool like camelot-py which is better at detecting borderless tables.

Would you like to try camelot, or adjust pdfplumber’s settings?

??rd.read_pdf
def read_pdf(file_path: str # path of PDF file to read
            ) -> str:
    "Reads the text of a PDF with PdfReader"
    # Open in binary mode: PdfReader parses raw PDF bytes, not decoded text.
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        # Extract each page's text and join the pages with a single space,
        # returning the whole document as one flat string.
        return ' '.join(page.extract_text() for page in reader.pages)

File: /usr/local/lib/python3.12/site-packages/contextkit/read.py

There is a Python library, marker (https://github.com/datalab-to/marker), which also extracts tables and images. I want to use it to extract chunks.

🤖Reply🤖

Great find! Marker is a modern library specifically designed for converting PDFs to markdown while preserving tables, images, and structure. It uses ML models for better extraction.

To use it, you’d need to:

  1. Install marker: !pip install marker-pdf
  2. Use it to convert your PDFs to markdown format
  3. Parse the markdown to extract text, tables, and images separately
  4. Create chunks with appropriate metadata

Would you like to try installing it and testing on one of your PDFs to see how well it extracts the tables?

It has its own built-in functionality to create chunks; let's use that.

🤖Reply🤖

Perfect! Marker has built-in chunking functionality which should handle tables and images intelligently.

You’ll want to explore Marker’s chunking API to see: 1. What metadata it provides for each chunk (like whether it’s a table, image, or text) 2. How it handles table formatting 3. What configuration options are available

Would you like to install marker and test it on one PDF first to see what the chunks look like?

Can you please read the repo?

🤖Reply🤖
{
  "id": "toolu_012Avekt16UrcHMDNJjT9saE",
  "call": {
    "function": "read_url",
    "arguments": {
      "url": "https://github.com/datalab-to/marker"
    }
  },
  "result": "'Skip to content\\n\\n## Navigation Menu\\n\\nToggle navigation\\n\\n[ ](/)\\n\\n[ Sign in ](/login?return_to=https%3A%2F%2Fgithub.com%2Fdatalab-to%2Fmarker)\\n\\nAppearance settings\\n\\n  * Platform \\n\\n    * [ GitHub Copilot  Write better code with AI  ](https://github.com/features/copilot)\\n    * [ GitHub Spark  New  Build and deploy intelligent apps  ](https://github.com/features/spark)\\n    * [ GitHub Models  New  Manage and compare prompts  ](https://github.com/features/models)\\n    * [ GitHub Advanced Security  Find and fix vulnerabilities  ](https://github.com/security/advanced-security)\\n    * [ Actions  Automate any workflow  ](https://github.com/features/actions)\\n\\n    * [ Codespaces  Instant dev environments  ](https://github.com/features/codespaces)\\n    * [ Issues  Plan and track work  ](https://github.com/features/issues)\\n    * [ Code Review  Manage code changes  ](https://github.com/features/code-review)\\n    * [ Discussions  Collaborate outside of code  ](https://github.com/features/discussions)\\n    * [ Code Search  Find more, search less  ](https://github.com/features/code-search)\\n\\nExplore\\n\\n    * [ Why GitHub ](https://github.com/why-github)\\n    * [ Documentation  ](https://docs.github.com)\\n    * [ GitHub Skills  ](https://skills.github.com)\\n    * [ Blog  ](https://github.blog)\\n\\nIntegrations\\n\\n    * [ GitHub Marketplace ](https://github.com/marketplace)\\n    * [ MCP Registry ](https://github.com/mcp)\\n\\n[ View all features  ](https://github.com/features)\\n\\n  * Solutions \\n\\nBy company size\\n\\n    * [ Enterprises ](https://github.com/enterprise)\\n    * [ Small and medium teams ](https://github.com/team)\\n    * [ Startups ](https://github.com/enterprise/startups)\\n    * [ Nonprofits ](/solutions/industry/nonprofits)\\n\\nBy use case\\n\\n    * [ App Modernization ](/solutions/use-case/app-modernization)\\n    * [ DevSecOps ](/solutions/use-case/devsecops)\\n    * [ DevOps 
](/solutions/use-case/devops)\\n    * [ CI/CD ](/solutions/use-case/ci-cd)\\n    * [ View all use cases ](/solutions/use-case)\\n\\nBy industry\\n\\n    * [ Healthcare ](/solutions/industry/healthcare)\\n    * [ Financial services ](/solutions/industry/financial-services)\\n    * [ Manufacturing ](/solutions/industry/manufacturing)\\n    * [ Government ](/solutions/industry/government)\\n    * [ View all industries ](/solutions/industry)\\n\\n[ View all solutions  ](/solutions)\\n\\n  * Resources \\n\\nTopics\\n\\n    * [ AI ](/resources/articles?topic=ai)\\n    * [ DevOps ](/resources/articles?topic=devops)\\n    * [ Security ](/resources/articles?topic=security)\\n    * [ Software Development ](/resources/articles?topic=software-development)\\n    * [ View all ](/resources/articles)\\n\\nExplore\\n\\n    * [ Learning Pathways  ](https://resources.github.com/learn/pathways)\\n    * [ Events & Webinars ](https://github.com/resources/events)\\n    * [ Ebooks & Whitepapers ](https://github.com/resources/whitepapers)\\n    * [ Customer Stories ](https://github.com/customer-stories)\\n    * [ Partners ](https://github.com/partners)\\n    * [ Executive Insights ](https://github.com/solutions/executive-insights)\\n\\n  * Open Source \\n\\n    * [ GitHub Sponsors  Fund open source developers  ](/sponsors)\\n\\n    * [ The ReadME Project  GitHub community articles  ](https://github.com/readme)\\n\\nRepositories\\n\\n    * [ Topics ](https://github.com/topics)\\n    * [ Trending ](https://github.com/trending)\\n    * [ Collections ](https://github.com/collections)\\n\\n  * Enterprise \\n\\n    * [ Enterprise platform  AI-powered developer platform  ](/enterprise)\\n\\nAvailable add-ons\\n\\n    * [ GitHub Advanced Security  Enterprise-grade security features  ](https://github.com/security/advanced-security)\\n    * [ Copilot for business  Enterprise-grade AI features  ](/features/copilot/copilot-business)\\n    * [ Premium Support  Enterprise-grade 24/7 support  
](/premium-support)\\n\\n  * [Pricing](https://github.com/pricing)\\n\\nSearch or jump to...\\n\\n# Search code, repositories, users, issues, pull requests...\\n\\nSearch\\n\\nClear\\n\\n[Search syntax tips](https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax)\\n\\n#  Provide feedback\\n\\nWe read every piece of feedback, and take your input very seriously.\\n\\nInclude my email address so I can be contacted\\n\\nCancel  Submit feedback\\n\\n#  Saved searches\\n\\n## Use saved searches to filter your results more quickly\\n\\nName\\n\\nQuery\\n\\nTo see all available qualifiers, see our [documentation](https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax).\\n\\nCancel  Create saved search\\n\\n[ Sign in ](/login?return_to=https%3A%2F%2Fgithub.com%2Fdatalab-to%2Fmarker)\\n\\n[ Sign up ](/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E%2F%3Crepo-name%3E&source=header-repo&source_repo=datalab-to%2Fmarker)\\n\\nAppearance settings\\n\\nResetting focus\\n\\nYou signed in with another tab or window. [Reload]() to refresh your session. You signed out in another tab or window. [Reload]() to refresh your session. You switched accounts on another tab or window. [Reload]() to refresh your session. 
Dismiss alert\\n\\n{{ message }}\\n\\n[ datalab-to ](/datalab-to) / **[marker](/datalab-to/marker) ** Public\\n\\n  * [ Notifications ](/login?return_to=%2Fdatalab-to%2Fmarker) You must be signed in to change notification settings\\n  * [ Fork 2k ](/login?return_to=%2Fdatalab-to%2Fmarker)\\n  * [ Star  29.9k ](/login?return_to=%2Fdatalab-to%2Fmarker)\\n\\nConvert PDF to markdown + JSON quickly with high accuracy\\n\\n[www.datalab.to](https://www.datalab.to \"https://www.datalab.to\")\\n\\n### License\\n\\n[ GPL-3.0 license ](/datalab-to/marker/blob/master/LICENSE)\\n\\n[ 29.9k stars ](/datalab-to/marker/stargazers) [ 2k forks ](/datalab-to/marker/forks) [ Branches ](/datalab-to/marker/branches) [ Tags ](/datalab-to/marker/tags) [ Activity ](/datalab-to/marker/activity)\\n\\n[ Star  ](/login?return_to=%2Fdatalab-to%2Fmarker)\\n\\n[ Notifications ](/login?return_to=%2Fdatalab-to%2Fmarker) You must be signed in to change notification settings\\n\\n  * [ Code ](/datalab-to/marker)\\n  * [ Issues 298 ](/datalab-to/marker/issues)\\n  * [ Pull requests 46 ](/datalab-to/marker/pulls)\\n  * [ Actions ](/datalab-to/marker/actions)\\n  * [ Projects 0 ](/datalab-to/marker/projects)\\n  * [ Security Uh oh!  There was an error while loading. [Please reload this page](). 
](/datalab-to/marker/security)\\n  * [ Insights ](/datalab-to/marker/pulse)\\n\\nAdditional navigation options\\n\\n  * [ Code  ](/datalab-to/marker)\\n  * [ Issues  ](/datalab-to/marker/issues)\\n  * [ Pull requests  ](/datalab-to/marker/pulls)\\n  * [ Actions  ](/datalab-to/marker/actions)\\n  * [ Projects  ](/datalab-to/marker/projects)\\n  * [ Security  ](/datalab-to/marker/security)\\n  * [ Insights  ](/datalab-to/marker/pulse)\\n\\n# datalab-to/marker\\n\\nmaster\\n\\n[Branches](/datalab-to/marker/branches)[Tags](/datalab-to/marker/tags)\\n\\n[](/datalab-to/marker/branches)[](/datalab-to/marker/tags)\\n\\nGo to file\\n\\nCode\\n\\nOpen more actions menu\\n\\n## Folders and files\\n\\nName| Name| Last commit message| Last commit date  \\n---|---|---|---  \\n  \\n## Latest commit\\n\\n## History\\n\\n[1,339 Commits](/datalab-to/marker/commits/master/)[](/datalab-to/marker/commits/master/)  \\n[.github](/datalab-to/marker/tree/master/.github \".github\")| [.github](/datalab-to/marker/tree/master/.github \".github\")|  |   \\n[benchmarks](/datalab-to/marker/tree/master/benchmarks \"benchmarks\")| [benchmarks](/datalab-to/marker/tree/master/benchmarks \"benchmarks\")|  |   \\n[data](/datalab-to/marker/tree/master/data \"data\")| [data](/datalab-to/marker/tree/master/data \"data\")|  |   \\n[examples](/datalab-to/marker/tree/master/examples \"examples\")| [examples](/datalab-to/marker/tree/master/examples \"examples\")|  |   \\n[marker](/datalab-to/marker/tree/master/marker \"marker\")| [marker](/datalab-to/marker/tree/master/marker \"marker\")|  |   \\n[signatures/version1](/datalab-to/marker/tree/master/signatures/version1 \"This path skips through empty directories\")| [signatures/version1](/datalab-to/marker/tree/master/signatures/version1 \"This path skips through empty directories\")|  |   \\n[static/fonts](/datalab-to/marker/tree/master/static/fonts \"This path skips through empty directories\")| [static/fonts](/datalab-to/marker/tree/master/static/fonts 
\"This path skips through empty directories\")|  |   \\n[tests](/datalab-to/marker/tree/master/tests \"tests\")| [tests](/datalab-to/marker/tree/master/tests \"tests\")|  |   \\n[.gitignore](/datalab-to/marker/blob/master/.gitignore \".gitignore\")| [.gitignore](/datalab-to/marker/blob/master/.gitignore \".gitignore\")|  |   \\n[.pre-commit-config.yaml](/datalab-to/marker/blob/master/.pre-commit-config.yaml \".pre-commit-config.yaml\")| [.pre-commit-config.yaml](/datalab-to/marker/blob/master/.pre-commit-config.yaml \".pre-commit-config.yaml\")|  |   \\n[CLA.md](/datalab-to/marker/blob/master/CLA.md \"CLA.md\")| [CLA.md](/datalab-to/marker/blob/master/CLA.md \"CLA.md\")|  |   \\n[LICENSE](/datalab-to/marker/blob/master/LICENSE \"LICENSE\")| [LICENSE](/datalab-to/marker/blob/master/LICENSE \"LICENSE\")|  |   \\n[MODEL_LICENSE](/datalab-to/marker/blob/master/MODEL_LICENSE \"MODEL_LICENSE\")| [MODEL_LICENSE](/datalab-to/marker/blob/master/MODEL_LICENSE \"MODEL_LICENSE\")|  |   \\n[README.md](/datalab-to/marker/blob/master/README.md \"README.md\")| [README.md](/datalab-to/marker/blob/master/README.md \"README.md\")|  |   \\n[chunk_convert.py](/datalab-to/marker/blob/master/chunk_convert.py \"chunk_convert.py\")| [chunk_convert.py](/datalab-to/marker/blob/master/chunk_convert.py \"chunk_convert.py\")|  |   \\n[convert.py](/datalab-to/marker/blob/master/convert.py \"convert.py\")| [convert.py](/datalab-to/marker/blob/master/convert.py \"convert.py\")|  |   \\n[convert_single.py](/datalab-to/marker/blob/master/convert_single.py \"convert_single.py\")| [convert_single.py](/datalab-to/marker/blob/master/convert_single.py \"convert_single.py\")|  |   \\n[extraction_app.py](/datalab-to/marker/blob/master/extraction_app.py \"extraction_app.py\")| [extraction_app.py](/datalab-to/marker/blob/master/extraction_app.py \"extraction_app.py\")|  |   \\n[marker_app.py](/datalab-to/marker/blob/master/marker_app.py \"marker_app.py\")| 
[marker_app.py](/datalab-to/marker/blob/master/marker_app.py \"marker_app.py\")|  |   \\n[marker_server.py](/datalab-to/marker/blob/master/marker_server.py \"marker_server.py\")| [marker_server.py](/datalab-to/marker/blob/master/marker_server.py \"marker_server.py\")|  |   \\n[poetry.lock](/datalab-to/marker/blob/master/poetry.lock \"poetry.lock\")| [poetry.lock](/datalab-to/marker/blob/master/poetry.lock \"poetry.lock\")|  |   \\n[pyproject.toml](/datalab-to/marker/blob/master/pyproject.toml \"pyproject.toml\")| [pyproject.toml](/datalab-to/marker/blob/master/pyproject.toml \"pyproject.toml\")|  |   \\n[pytest.ini](/datalab-to/marker/blob/master/pytest.ini \"pytest.ini\")| [pytest.ini](/datalab-to/marker/blob/master/pytest.ini \"pytest.ini\")|  |   \\nView all files  \\n  \\n## Repository files navigation\\n\\n  * README\\n  * GPL-3.0 license\\n\\n# Marker\\n\\nMarker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.\\n\\n  * Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages\\n  * Formats tables, forms, equations, inline math, links, references, and code blocks\\n  * Extracts and saves images\\n  * Removes headers/footers/other artifacts\\n  * Extensible with your own formatting and logic\\n  * Does structured extraction, given a JSON schema (beta)\\n  * Optionally boost accuracy with LLMs (and your own prompt)\\n  * Works on GPU, CPU, or MPS\\n\\nFor our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker).\\n\\n## Performance\\n\\n[![](/datalab-to/marker/raw/master/data/images/overall.png)](/datalab-to/marker/blob/master/data/images/overall.png)\\n\\nMarker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools.\\n\\nThe above results are running single PDF pages serially. 
Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100.\\n\\nSee below for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.\\n\\n## Hybrid Mode\\n\\nFor the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See below for details.\\n\\nHere is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:\\n\\n[![](/datalab-to/marker/raw/master/data/images/table.png)](/datalab-to/marker/blob/master/data/images/table.png)\\n\\nAs you can see, the use_llm mode offers higher accuracy than marker or gemini alone.\\n\\n## Examples\\n\\nPDF | File type | Markdown | JSON  \\n---|---|---|---  \\n[Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json)  \\n[Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json)  \\n[Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json)  \\n  \\n# Commercial usage\\n\\nOur model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code 
is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker).\\n\\n# Hosted API & On-prem\\n\\nThere\\'s a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it\\'s free to sign up, and we\\'ll throw in credits for you to test it out.\\n\\nThe API:\\n\\n  * Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files\\n  * Is 1/4th the price of leading cloud-based competitors\\n  * Fast - ~15s for a 250 page PDF\\n  * Supports LLM mode\\n  * High uptime (99.99%)\\n\\n# Community\\n\\n[Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development.\\n\\n# Installation\\n\\nYou\\'ll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/).\\n\\nInstall with:\\n\\n```\\npip install marker-pdf\\n```\\n\\nIf you want to use marker on documents other than PDFs, you will need to install additional dependencies with:\\n\\n```\\npip install marker-pdf[full]\\n```\\n\\n# Usage\\n\\nFirst, some configuration:\\n\\n  * Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.\\n  * Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or the `strip_existing_ocr` to keep all digital text, and strip out any existing OCR text.\\n  * If you care about inline math, set `force_ocr` to convert inline math to LaTeX.\\n\\n## Interactive App\\n\\nI\\'ve included a streamlit app that lets you interactively try marker with some basic options. Run it with:\\n\\n```\\npip install streamlit streamlit-ace\\nmarker_gui\\n```\\n\\n## Convert a single file\\n\\n```\\nmarker_single /path/to/file.pdf\\n```\\n\\nYou can pass in PDFs or images.\\n\\nOptions:\\n\\n  * `--page_range TEXT`: Specify which pages to process. 
Accepts comma-separated page numbers and ranges. Example: `--page_range \"0,5-10,20\"` will process pages 0, 5 through 10, and page 20.\\n  * `--output_format [markdown|json|html|chunks]`: Specify the format for the output results.\\n  * `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.\\n  * `--paginate_output`: Paginates the output, using `\\\\n\\\\n{PAGE_NUMBER}` followed by `-` * 48, then `\\\\n\\\\n`\\n  * `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see below.\\n  * `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly.\\n  * `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.\\n  * `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.\\n  * `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.\\n  * `--disable_image_extraction`: Don\\'t extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.\\n  * `--debug`: Enable debug mode for additional logging and diagnostic information.\\n  * `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors \"module1.processor1,module2.processor2\"`\\n  * `--config_json PATH`: Path to a JSON configuration file containing additional settings.\\n  * `config --help`: List all available builders, processors, and converters, and their associated configuration. 
These values can be used to build a JSON configuration file for additional tweaking of marker defaults.\\n  * `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.\\n  * `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`.\\n  * `--help`: see all of the flags that can be passed into marker. (it supports many more options then are listed above)\\n\\nThe list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don\\'t need OCR, marker can work with any language.\\n\\n## Convert multiple files\\n\\n```\\nmarker /path/to/input/folder\\n```\\n\\n  * `marker` supports all the same options from `marker_single` above.\\n  * `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.\\n\\n## Convert multiple files on multiple GPUs\\n\\n```\\nNUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out\\n```\\n\\n  * `NUM_DEVICES` is the number of GPUs to use. 
Should be `2` or greater.\\n  * `NUM_WORKERS` is the number of parallel processes to run on each GPU.\\n\\n## Use from python\\n\\nSee the `PdfConverter` class at `marker/converters/pdf.py` function for additional arguments that can be passed.\\n\\n```\\nfrom marker.converters.pdf import PdfConverter\\nfrom marker.models import create_model_dict\\nfrom marker.output import text_from_rendered\\n\\nconverter = PdfConverter(\\n    artifact_dict=create_model_dict(),\\n)\\nrendered = converter(\"FILEPATH\")\\ntext, _, images = text_from_rendered(rendered)\\n```\\n\\n`rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you\\'ll have the properties `markdown`, `metadata`, and `images`. For json output, you\\'ll have `children`, `block_type`, and `metadata`.\\n\\n### Custom configuration\\n\\nYou can pass configuration using the `ConfigParser`. To see all availa<TRUNCATED>"
}

Perfect! I’ve read the Marker repo. Key findings for your use case:

Marker has built-in chunking support! There’s a chunk_convert.py file and you can output in “chunks” format using --output_format chunks.

The chunks format: - Flattens everything into a single list (not a tree) - Only top-level blocks from each page - Includes full HTML of each block - Designed specifically for RAG applications

For tables specifically: - Marker extracts and formats tables automatically - Can use --use_llm flag to improve table accuracy significantly (0.816 → 0.907 score) - Has a dedicated TableConverter for table-only extraction

Would you like to try using Marker’s chunking functionality to replace your current document processing?

I want to first try, lets try it on the strubell2019 document

🤖Reply🤖

Great! Let’s start by installing marker and testing it on that document.

First, install marker:

!pip install marker-pdf

Then try converting the strubell2019.pdf with chunking output:

!marker_single wattbot_data/strubell2019.pdf --output_format chunks --output_dir marker_output

This will create chunks and save them to the marker_output directory. Want to give it a try and see what the output looks like?

!pip install marker-pdf
Collecting marker-pdf
  Downloading marker_pdf-1.10.1-py3-none-any.whl.metadata (30 kB)
Collecting Pillow<11.0.0,>=10.1.0 (from marker-pdf)
  Downloading pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting anthropic<0.47.0,>=0.46.0 (from marker-pdf)
  Downloading anthropic-0.46.0-py3-none-any.whl.metadata (23 kB)
Requirement already satisfied: click<9.0.0,>=8.2.0 in /usr/local/lib/python3.12/site-packages (from marker-pdf) (8.3.1)
Requirement already satisfied: filetype<2.0.0,>=1.2.0 in ./.local/lib/python3.12/site-packages (from marker-pdf) (1.2.0)
Collecting ftfy<7.0.0,>=6.1.1 (from marker-pdf)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Requirement already satisfied: google-genai<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/site-packages (from marker-pdf) (1.51.0)
Collecting markdown2<3.0.0,>=2.5.2 (from marker-pdf)
  Downloading markdown2-2.5.4-py3-none-any.whl.metadata (2.1 kB)
Collecting markdownify<2.0.0,>=1.1.0 (from marker-pdf)
  Downloading markdownify-1.2.2-py3-none-any.whl.metadata (9.9 kB)
Collecting openai<2.0.0,>=1.65.2 (from marker-pdf)
  Using cached openai-1.109.1-py3-none-any.whl.metadata (29 kB)
Collecting pdftext<0.7.0,>=0.6.3 (from marker-pdf)
  Downloading pdftext-0.6.3-py3-none-any.whl.metadata (8.5 kB)
Collecting pre-commit<5.0.0,>=4.2.0 (from marker-pdf)
  Downloading pre_commit-4.4.0-py2.py3-none-any.whl.metadata (1.2 kB)
Requirement already satisfied: pydantic<3.0.0,>=2.4.2 in /usr/local/lib/python3.12/site-packages (from marker-pdf) (2.12.4)
Requirement already satisfied: pydantic-settings<3.0.0,>=2.0.3 in ./.local/lib/python3.12/site-packages (from marker-pdf) (2.10.1)
Requirement already satisfied: python-dotenv<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/site-packages (from marker-pdf) (1.2.1)
Collecting rapidfuzz<4.0.0,>=3.8.1 (from marker-pdf)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting regex<2025.0.0,>=2024.4.28 (from marker-pdf)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Requirement already satisfied: scikit-learn<2.0.0,>=1.6.1 in ./.local/lib/python3.12/site-packages (from marker-pdf) (1.7.2)
Collecting surya-ocr<0.18.0,>=0.17.0 (from marker-pdf)
  Downloading surya_ocr-0.17.0-py3-none-any.whl.metadata (34 kB)
Requirement already satisfied: torch<3.0.0,>=2.7.0 in /usr/local/lib/python3.12/site-packages (from marker-pdf) (2.9.1+cpu)
Requirement already satisfied: tqdm<5.0.0,>=4.66.1 in /usr/local/lib/python3.12/site-packages (from marker-pdf) (4.67.1)
Requirement already satisfied: transformers<5.0.0,>=4.45.2 in ./.local/lib/python3.12/site-packages (from marker-pdf) (4.56.2)
Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.12/site-packages (from anthropic<0.47.0,>=0.46.0->marker-pdf) (4.11.0)
Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.12/site-packages (from anthropic<0.47.0,>=0.46.0->marker-pdf) (1.9.0)
Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/site-packages (from anthropic<0.47.0,>=0.46.0->marker-pdf) (0.28.1)
Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.12/site-packages (from anthropic<0.47.0,>=0.46.0->marker-pdf) (0.12.0)
Requirement already satisfied: sniffio in /usr/local/lib/python3.12/site-packages (from anthropic<0.47.0,>=0.46.0->marker-pdf) (1.3.1)
Requirement already satisfied: typing-extensions<5,>=4.10 in /usr/local/lib/python3.12/site-packages (from anthropic<0.47.0,>=0.46.0->marker-pdf) (4.15.0)
Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.12/site-packages (from anyio<5,>=3.5.0->anthropic<0.47.0,>=0.46.0->marker-pdf) (3.11)
Requirement already satisfied: wcwidth in /usr/local/lib/python3.12/site-packages (from ftfy<7.0.0,>=6.1.1->marker-pdf) (0.2.14)
Requirement already satisfied: google-auth<3.0.0,>=2.14.1 in /usr/local/lib/python3.12/site-packages (from google-genai<2.0.0,>=1.0.0->marker-pdf) (2.43.0)
Requirement already satisfied: requests<3.0.0,>=2.28.1 in /usr/local/lib/python3.12/site-packages (from google-genai<2.0.0,>=1.0.0->marker-pdf) (2.32.5)
Requirement already satisfied: tenacity<9.2.0,>=8.2.3 in ./.local/lib/python3.12/site-packages (from google-genai<2.0.0,>=1.0.0->marker-pdf) (9.1.2)
Requirement already satisfied: websockets<15.1.0,>=13.0.0 in /usr/local/lib/python3.12/site-packages (from google-genai<2.0.0,>=1.0.0->marker-pdf) (15.0.1)
Requirement already satisfied: cachetools<7.0,>=2.0.0 in /usr/local/lib/python3.12/site-packages (from google-auth<3.0.0,>=2.14.1->google-genai<2.0.0,>=1.0.0->marker-pdf) (6.2.2)
Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.12/site-packages (from google-auth<3.0.0,>=2.14.1->google-genai<2.0.0,>=1.0.0->marker-pdf) (0.4.2)
Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.12/site-packages (from google-auth<3.0.0,>=2.14.1->google-genai<2.0.0,>=1.0.0->marker-pdf) (4.9.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.12/site-packages (from httpx<1,>=0.23.0->anthropic<0.47.0,>=0.46.0->marker-pdf) (2025.11.12)
Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/site-packages (from httpx<1,>=0.23.0->anthropic<0.47.0,>=0.46.0->marker-pdf) (1.0.9)
Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->anthropic<0.47.0,>=0.46.0->marker-pdf) (0.16.0)
Requirement already satisfied: beautifulsoup4<5,>=4.9 in /usr/local/lib/python3.12/site-packages (from markdownify<2.0.0,>=1.1.0->marker-pdf) (4.14.2)
Requirement already satisfied: six<2,>=1.15 in /usr/local/lib/python3.12/site-packages (from markdownify<2.0.0,>=1.1.0->marker-pdf) (1.17.0)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/site-packages (from beautifulsoup4<5,>=4.9->markdownify<2.0.0,>=1.1.0->marker-pdf) (2.8)
Collecting pypdfium2==4.30.0 (from pdftext<0.7.0,>=0.6.3->marker-pdf)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Collecting cfgv>=2.0.0 (from pre-commit<5.0.0,>=4.2.0->marker-pdf)
  Downloading cfgv-3.4.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting identify>=1.0.0 (from pre-commit<5.0.0,>=4.2.0->marker-pdf)
  Downloading identify-2.6.15-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting nodeenv>=0.11.1 (from pre-commit<5.0.0,>=4.2.0->marker-pdf)
  Downloading nodeenv-1.9.1-py2.py3-none-any.whl.metadata (21 kB)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.12/site-packages (from pre-commit<5.0.0,>=4.2.0->marker-pdf) (6.0.3)
Collecting virtualenv>=20.10.0 (from pre-commit<5.0.0,>=4.2.0->marker-pdf)
  Downloading virtualenv-20.35.4-py3-none-any.whl.metadata (4.6 kB)
Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.4.2->marker-pdf) (0.7.0)
Requirement already satisfied: pydantic-core==2.41.5 in /usr/local/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.4.2->marker-pdf) (2.41.5)
Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.4.2->marker-pdf) (0.4.2)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/site-packages (from requests<3.0.0,>=2.28.1->google-genai<2.0.0,>=1.0.0->marker-pdf) (3.4.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/site-packages (from requests<3.0.0,>=2.28.1->google-genai<2.0.0,>=1.0.0->marker-pdf) (2.5.0)
Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.12/site-packages (from rsa<5,>=3.1.4->google-auth<3.0.0,>=2.14.1->google-genai<2.0.0,>=1.0.0->marker-pdf) (0.6.1)
Requirement already satisfied: numpy>=1.22.0 in /usr/local/lib/python3.12/site-packages (from scikit-learn<2.0.0,>=1.6.1->marker-pdf) (2.3.5)
Requirement already satisfied: scipy>=1.8.0 in /usr/local/lib/python3.12/site-packages (from scikit-learn<2.0.0,>=1.6.1->marker-pdf) (1.16.3)
Requirement already satisfied: joblib>=1.2.0 in ./.local/lib/python3.12/site-packages (from scikit-learn<2.0.0,>=1.6.1->marker-pdf) (1.5.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in ./.local/lib/python3.12/site-packages (from scikit-learn<2.0.0,>=1.6.1->marker-pdf) (3.6.0)
Collecting einops<0.9.0,>=0.8.1 (from surya-ocr<0.18.0,>=0.17.0->marker-pdf)
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting opencv-python-headless==4.11.0.86 (from surya-ocr<0.18.0,>=0.17.0->marker-pdf)
  Downloading opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Requirement already satisfied: platformdirs<5.0.0,>=4.3.6 in /usr/local/lib/python3.12/site-packages (from surya-ocr<0.18.0,>=0.17.0->marker-pdf) (4.5.0)
Requirement already satisfied: filelock in /usr/local/lib/python3.12/site-packages (from torch<3.0.0,>=2.7.0->marker-pdf) (3.20.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.12/site-packages (from torch<3.0.0,>=2.7.0->marker-pdf) (80.9.0)
Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/site-packages (from torch<3.0.0,>=2.7.0->marker-pdf) (1.14.0)
Requirement already satisfied: networkx>=2.5.1 in /usr/local/lib/python3.12/site-packages (from torch<3.0.0,>=2.7.0->marker-pdf) (3.5)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/site-packages (from torch<3.0.0,>=2.7.0->marker-pdf) (3.1.6)
Requirement already satisfied: fsspec>=0.8.5 in ./.local/lib/python3.12/site-packages (from torch<3.0.0,>=2.7.0->marker-pdf) (2025.3.0)
Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in ./.local/lib/python3.12/site-packages (from transformers<5.0.0,>=4.45.2->marker-pdf) (0.36.0)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/site-packages (from transformers<5.0.0,>=4.45.2->marker-pdf) (25.0)
Requirement already satisfied: tokenizers<=0.23.0,>=0.22.0 in /usr/local/lib/python3.12/site-packages (from transformers<5.0.0,>=4.45.2->marker-pdf) (0.22.1)
Requirement already satisfied: safetensors>=0.4.3 in ./.local/lib/python3.12/site-packages (from transformers<5.0.0,>=4.45.2->marker-pdf) (0.6.2)
Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.34.0->transformers<5.0.0,>=4.45.2->marker-pdf) (1.2.0)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/site-packages (from sympy>=1.13.3->torch<3.0.0,>=2.7.0->marker-pdf) (1.3.0)
Collecting distlib<1,>=0.3.7 (from virtualenv>=20.10.0->pre-commit<5.0.0,>=4.2.0->marker-pdf)
  Downloading distlib-0.4.0-py2.py3-none-any.whl.metadata (5.2 kB)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/site-packages (from jinja2->torch<3.0.0,>=2.7.0->marker-pdf) (3.0.3)
Downloading marker_pdf-1.10.1-py3-none-any.whl (188 kB)
Downloading anthropic-0.46.0-py3-none-any.whl (223 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
Downloading markdown2-2.5.4-py3-none-any.whl (49 kB)
Downloading markdownify-1.2.2-py3-none-any.whl (15 kB)
Using cached openai-1.109.1-py3-none-any.whl (948 kB)
Downloading pdftext-0.6.3-py3-none-any.whl (23 kB)
Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/2.8 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.8/2.8 MB 86.5 MB/s  0:00:00
Downloading pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl (4.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/4.5 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺ 4.5/4.5 MB 280.7 MB/s eta 0:00:01   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.5/4.5 MB 16.0 MB/s  0:00:00
Downloading pre_commit-4.4.0-py2.py3-none-any.whl (226 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/3.2 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸ 3.1/3.2 MB 233.7 MB/s eta 0:00:01   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 12.3 MB/s  0:00:00
Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/796.9 kB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 796.9/796.9 kB 36.4 MB/s  0:00:00
Downloading surya_ocr-0.17.0-py3-none-any.whl (183 kB)
Downloading opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (50.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/50.0 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸━━━━━━━━━ 38.3/50.0 MB 191.7 MB/s eta 0:00:01   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸ 49.8/50.0 MB 176.3 MB/s eta 0:00:01   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50.0/50.0 MB 98.1 MB/s  0:00:00
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Downloading cfgv-3.4.0-py2.py3-none-any.whl (7.2 kB)
Downloading identify-2.6.15-py2.py3-none-any.whl (99 kB)
Downloading nodeenv-1.9.1-py2.py3-none-any.whl (22 kB)
Downloading virtualenv-20.35.4-py3-none-any.whl (6.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/6.0 MB ? eta -:--:--   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/6.0 MB 49.7 MB/s  0:00:00
Downloading distlib-0.4.0-py2.py3-none-any.whl (469 kB)
Installing collected packages: distlib, virtualenv, regex, rapidfuzz, pypdfium2, Pillow, opencv-python-headless, nodeenv, markdown2, identify, ftfy, einops, cfgv, pre-commit, markdownify, openai, anthropic, pdftext, surya-ocr, marker-pdf
   ━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  1/20 [virtualenv]  WARNING: The script virtualenv is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  3/20 [rapidfuzz]  Attempting uninstall: pypdfium2
   ━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  3/20 [rapidfuzz]    Found existing installation: pypdfium2 5.0.0
   ━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  3/20 [rapidfuzz]    Uninstalling pypdfium2-5.0.0:
   ━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  3/20 [rapidfuzz]      Successfully uninstalled pypdfium2-5.0.0
   ━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  3/20 [rapidfuzz]   ━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  4/20 [pypdfium2]  WARNING: The script pypdfium2 is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  4/20 [pypdfium2]   ━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  5/20 [Pillow]   ━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━  5/20 [Pillow]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━  6/20 [opencv-python-headless]   ━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━  7/20 [nodeenv]  WARNING: The script nodeenv is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━  7/20 [nodeenv]  WARNING: The script markdown2 is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
  WARNING: The script identify-cli is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
  WARNING: The script ftfy is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━ 10/20 [ftfy]   ━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━ 10/20 [ftfy]  WARNING: The script pre-commit is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━ 14/20 [markdownify]  WARNING: The script markdownify is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
  Attempting uninstall: openai
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━ 14/20 [markdownify]    Found existing installation: openai 2.7.1
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━ 14/20 [markdownify]    Uninstalling openai-2.7.1:
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━ 14/20 [markdownify]      Successfully uninstalled openai-2.7.1
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━ 14/20 [markdownify]   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━ 15/20 [openai]   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━ 15/20 [openai]   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━ 15/20 [openai]   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━ 15/20 [openai]  WARNING: The script openai is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━ 15/20 [openai]   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━ 16/20 [anthropic]  WARNING: The script pdftext is installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━ 18/20 [surya-ocr]  WARNING: The scripts surya_detect, surya_gui, surya_latex_ocr, surya_layout, surya_ocr, surya_table and texify_gui are installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━ 19/20 [marker-pdf]  WARNING: The scripts marker, marker_chunk_convert, marker_extract, marker_gui, marker_server and marker_single are installed in '/app/data/.local/bin' which is not on PATH.
  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20/20 [marker-pdf]
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
reward-kit 0.4.1 requires openai==1.78.1, but you have openai 1.109.1 which is incompatible.
claudette 0.3.10 requires anthropic>=0.52, but you have anthropic 0.46.0 which is incompatible.
Successfully installed Pillow-10.4.0 anthropic-0.46.0 cfgv-3.4.0 distlib-0.4.0 einops-0.8.1 ftfy-6.3.1 identify-2.6.15 markdown2-2.5.4 markdownify-1.2.2 marker-pdf-1.10.1 nodeenv-1.9.1 openai-1.109.1 opencv-python-headless-4.11.0.86 pdftext-0.6.3 pre-commit-4.4.0 pypdfium2-4.30.0 rapidfuzz-3.14.3 regex-2024.11.6 surya-ocr-0.17.0 virtualenv-20.35.4

Is it possible to use a Python equivalent instead of the CLI command marker_single?

🤖Reply🤖

Yes! From the README I read earlier, you can use Marker directly in Python. Here’s the basic approach:

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
rendered = converter("wattbot_data/strubell2019.pdf")

For chunks specifically, you’d need to specify the output format. Would you like to try this and see what properties the rendered object has?

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
/usr/local/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
rendered = converter("wattbot_data/strubell2019.pdf")
len(rendered)

lets try fitz, i want to know if we can give chunk size and overlap

🤖Reply🤖

PyMuPDF (fitz) is primarily a PDF rendering library - it doesn’t have built-in chunking with overlap like some text processing libraries. However, you can:

  1. Extract text from the PDF with fitz
  2. Manually implement chunking with overlap on the extracted text

Would you like to try extracting text with fitz first to see what you get, or would you prefer to continue exploring Marker’s chunking capabilities since it’s already designed for this use case?

lets try with fitz because marker is difficult to run and takes a lot of resources. we can try marker if fitz does not work out

🤖Reply🤖

Good call - fitz is much lighter weight. Let’s start by extracting text from the strubell2019 PDF with fitz:

import fitz
doc = fitz.open(data_path/'strubell2019.pdf')

Then you can extract text page by page or all at once. Would you like to try extracting from one page first to see what the output looks like?

import fitz
doc = fitz.open(data_path/'strubell2019.pdf')

sure

🤖Reply🤖

Try extracting text from the first page:

page = doc[0]
text = page.get_text()
print(text)

This will show you what fitz extracts. Take a look at the output and see how it compares to what you got with rd.read_pdf.

page = doc[0]
text = page.get_text()
print(text)
arXiv:1906.02243v1  [cs.CL]  5 Jun 2019
Energy and Policy Considerations for Deep Learning in NLP
Emma Strubell
Ananya Ganesh
Andrew McCallum
College of Information and Computer Sciences
University of Massachusetts Amherst
{strubell, aganesh, mccallum}@cs.umass.edu
Abstract
Recent progress in hardware and methodol-
ogy for training neural networks has ushered
in a new generation of large networks trained
on abundant data.
These models have ob-
tained notable gains in accuracy across many
NLP tasks. However, these accuracy improve-
ments depend on the availability of exception-
ally large computational resources that neces-
sitate similarly substantial energy consump-
tion. As a result these models are costly to
train and develop, both financially, due to the
cost of hardware and electricity or cloud com-
pute time, and environmentally, due to the car-
bon footprint required to fuel modern tensor
processing hardware. In this paper we bring
this issue to the attention of NLP researchers
by quantifying the approximate financial and
environmental costs of training a variety of re-
cently successful neural network models for
NLP. Based on these findings, we propose ac-
tionable recommendations to reduce costs and
improve equity in NLP research and practice.
1
Introduction
Advances in techniques and hardware for train-
ing deep neural networks have recently en-
abled impressive accuracy improvements across
many fundamental NLP tasks (Bahdanau et al.,
2015; Luong et al., 2015; Dozat and Man-
ning, 2017; Vaswani et al., 2017), with the
most computationally-hungry models obtaining
the highest scores (Peters et al., 2018; Devlin et al.,
2019; Radford et al., 2019; So et al., 2019). As
a result, training a state-of-the-art model now re-
quires substantial computational resources which
demand considerable energy, along with the as-
sociated financial and environmental costs.
Re-
search and development of new models multiplies
these costs by thousands of times by requiring re-
training to experiment with model architectures
and hyperparameters. Whereas a decade ago most
Consumption
CO2e (lbs)
Air travel, 1 passenger, NY↔SF
1984
Human life, avg, 1 year
11,023
American life, avg, 1 year
36,156
Car, avg incl. fuel, 1 lifetime
126,000
Training one model (GPU)
NLP pipeline (parsing, SRL)
39
w/ tuning & experimentation
78,468
Transformer (big)
192
w/ neural architecture search
626,155
Table 1: Estimated CO2 emissions from training com-
mon NLP models, compared to familiar consumption.1
NLP models could be trained and developed on
a commodity laptop or server, many now require
multiple instances of specialized hardware such as
GPUs or TPUs, therefore limiting access to these
highly accurate models on the basis of finances.
Even when these expensive computational re-
sources are available, model training also incurs a
substantial cost to the environment due to the en-
ergy required to power this hardware for weeks or
months at a time. Though some of this energy may
come from renewable or carbon credit-offset re-
sources, the high energy demands of these models
are still a concern since (1) energy is not currently
derived from carbon-neutral sources in many loca-
tions, and (2) when renewable energy is available,
it is still limited to the equipment we have to pro-
duce and store it, and energy spent training a neu-
ral network might better be allocated to heating a
family’s home. It is estimated that we must cut
carbon emissions by half over the next decade to
deter escalating rates of natural disaster, and based
on the estimated CO2 emissions listed in Table 1,
1Sources:
(1) Air travel and per-capita consump-
tion:
https://bit.ly/2Hw0xWc;
(2) car lifetime:
https://bit.ly/2Qbr0w1.

The table information is extracted but how to know rows and column

🤖Reply🤖

Good observation! Fitz extracts the table as plain text without preserving the row/column structure. To get structured table data, you’d need to:

  1. Use fitz’s table detection: page.find_tables() - this can identify tables and extract them as structured data
  2. Parse the text manually - look for patterns in spacing/alignment

Would you like to try page.find_tables() to see if it can detect the table structure on this page?

tables = page.find_tables(); tables
<pymupdf.table.TableFinder>

what is next?

🤖Reply🤖

The tables object is a TableFinder. To see the actual tables, try:

tables.tables

This will show you a list of detected tables. If there are any, you can then extract them as structured data.

tables.tables
[]

it is empty

🤖Reply🤖

Fitz’s table detection didn’t find the table on that page - probably because it’s formatted as plain text without borders or clear structure.

You have a few options:

  1. Try adjusting fitz’s table detection settings - it has parameters you can tune
  2. Extract tables as images and use OCR - fitz can extract page regions as images
  3. Go back to Marker - it uses ML models specifically trained for table detection
  4. Use a dedicated table extraction library like camelot or tabula

Since fitz isn’t detecting the tables automatically, would you like to try one of these alternatives, or would you prefer to work with the text as-is and implement your own chunking strategy?

using marker will return markdown. Do you know how to chunk a markdown?

🤖Reply🤖

Yes! Markdown can be chunked in several ways:

  1. By headers - Split on #, ##, ### etc. to keep related sections together
  2. By paragraphs - Split on \n\n (double newlines)
  3. By semantic units - Use a library like langchain or llama-index that has markdown-aware chunkers
  4. Fixed size with overlap - Just like regular text, but preserve markdown formatting

For tables specifically, Marker converts them to markdown table format, which you could: - Keep as complete units (don’t split tables across chunks) - Parse into structured data

Would you like to try getting Marker’s markdown output first and then explore how to chunk it effectively?

I have got access to datalab now. I want to use their api for the markdown extraction

🤖Reply🤖

🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠

Great! Datalab is the hosted API for Marker. Would you like me to look up the Datalab API documentation to see how to use it for markdown extraction?

Yes

🤖Reply🤖

Perfect! I found the Datalab API documentation. Here’s the key info:

* The marker endpoint is at /api/v1/marker, and you can submit requests with parameters like output_format (set to ‘markdown’), force_ocr, use_llm, etc.

Basic Python example:

import requests

url = "https://www.datalab.to/api/v1/marker"
headers = {"X-Api-Key": "YOUR_API_KEY"}
# Open the PDF inside a context manager so the handle is closed after the
# upload — the original inline open() leaked the file descriptor.
with open('path/to/file.pdf', 'rb') as f:
    form_data = {
        'file': ('test.pdf', f, 'application/pdf'),
        'output_format': (None, 'markdown'),  # or 'chunks'
    }
    response = requests.post(url, files=form_data, headers=headers)

* This returns a request_id that you use to retrieve the result later.

Would you like to try calling the API with one of your PDFs?

I have the following code #| export class HybridSearch: def init(self, all_chunks, dense_embds, tokenized_corpus): fc.store_attr() self.bm25 = BM25Okapi(self.tokenized_corpus)

def search(self, query, limit=5):
    """Hybrid retrieval: union of BM25 keyword hits and dense-embedding hits."""
    # Embed the query with the shared module-level `embedder`.
    query_emb = embedder.create_dembs(texts=[query])[0]
    tokenized_query = query.lower().split()
    # Lexical leg: top-`limit` chunks by BM25 score.
    results = self.bm25.get_top_n(tokenized_query, self.all_chunks, n=limit)
    scores = cosine_similarity([query_emb], self.dense_embds)[0]
    # print(len(scores), scores[0], len(scores[0]))
    # Indices of the top-`limit` cosine scores, best first.
    idx = np.argsort(scores)[-limit:][::-1]
    # NOTE(review): fancy-indexing with an ndarray requires self.all_chunks
    # to be a numpy array or fc.L — a plain list would raise TypeError here.
    # Also, duplicates between the BM25 and dense legs are not removed.
    results.extend(self.all_chunks[idx])
    return results

I want to use qwen reranker, also I want to include the preceding and next chunk for each chunk in the results

🤖Reply🤖

Good improvements! Let me break this down into steps:

For reranking with Qwen: You’ll need to combine the BM25 and dense results first, then pass them to the reranker to get the final top-k.

For including preceding/next chunks: You’ll need to track chunk indices so you can grab adjacent chunks.

Which would you like to tackle first - adding the reranker or implementing the context window (prev/next chunks)?

Including the prev and next chunks

🤖Reply🤖

🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠

Good choice! To include prev/next chunks, you’ll need to:

  1. Track chunk indices in your results
  2. Fetch adjacent chunks based on those indices
  3. Handle edge cases (first/last chunks in a document)

Here’s a helper method you could add to your HybridSearch class:

def get_context_chunks(self, chunk_indices, window=1):
    """Return the chunks at *chunk_indices* plus up to *window* neighbours on
    each side, de-duplicated and ordered by position in ``self.all_chunks``."""
    picked = {}  # index -> chunk; dict keys double as the "already seen" set
    n = len(self.all_chunks)
    for idx in chunk_indices:
        lo, hi = max(0, idx - window), min(n, idx + window + 1)
        for i in range(lo, hi):
            if i not in picked:
                picked[i] = self.all_chunks[i]
    # ascending index order == document reading order
    return [picked[i] for i in sorted(picked)]

However, there’s one issue - you probably only want to include adjacent chunks from the same document. Should we add a check for doc_id to prevent mixing chunks from different documents?

Yes

🤖Reply🤖

Perfect! Here’s an updated version that respects document boundaries:

def get_context_chunks(self, chunk_indices, window=1):
    """Get chunks with surrounding context, respecting document boundaries."""
    picked = {}  # index -> chunk; doubles as the dedup set
    n = len(self.all_chunks)
    for idx in chunk_indices:
        doc = self.all_chunks[idx].doc_id
        # scan the window around the hit; only same-document neighbours count
        for j in range(idx - window, idx + window + 1):
            if 0 <= j < n and j not in picked and self.all_chunks[j].doc_id == doc:
                picked[j] = self.all_chunks[j]
    # return in ascending index order (document reading order)
    return [picked[j] for j in sorted(picked)]

Then update your search method to use it:

def search(self, query, limit=5, window=1):
    """Sketch of the context-aware search entry point.

    NOTE(review): `bm25_indices` and `dense_indices` are placeholders — they
    must be produced by the elided retrieval logic above before this runs.
    """
    # ... your existing search logic ...
    
    # Combine and get top indices
    # NOTE(review): set() iteration order is unstable, so which indices
    # survive the [:limit] truncation can vary between runs — confirm intended.
    combined_indices = list(set(bm25_indices + dense_indices))[:limit]
    
    # Get chunks with context
    return self.get_context_chunks(combined_indices, window=window)

Would you like to integrate this into your current code?

Please follow fastai coding style

🤖Reply🤖

🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠

Here’s the fastai-style version:

def get_context_chunks(self, idxs, window=1):
    """Get chunks with surrounding context, respecting doc boundaries"""
    seen = set()
    pairs = []
    n = len(self.all_chunks)
    for i in idxs:
        did = self.all_chunks[i].doc_id
        lo, hi = max(0, i - window), min(n, i + window + 1)
        for j in range(lo, hi):
            # skip duplicates and neighbours that belong to another document
            if j in seen or self.all_chunks[j].doc_id != did: continue
            pairs.append((j, self.all_chunks[j]))
            seen.add(j)
    return fc.L(sorted(pairs)).itemgot(1)

And the updated search method:

def search(self, query, limit=5, window=1):
    """Hybrid search: union of BM25 and dense-cosine hits, expanded with context.

    Returns up to `limit` seed chunks plus their neighbours via
    `get_context_chunks`. Dedup keeps first-seen order (BM25 first), so the
    truncation to `limit` is deterministic.
    """
    q_emb = embedder.create_dembs(texts=[query])[0]
    tok_q = query.lower().split()

    # BM25 over chunk *indices* so both retrievers return the same currency
    bm25_idxs = self.bm25.get_top_n(tok_q, range(len(self.all_chunks)), n=limit)
    dense_scores = cosine_similarity([q_emb], self.dense_embds)[0]
    dense_idxs = np.argsort(dense_scores)[-limit:][::-1].tolist()

    # dict.fromkeys dedups while preserving insertion order; the previous
    # list(set(...)) made the [:limit] truncation nondeterministic between runs
    idxs = list(dict.fromkeys(list(bm25_idxs) + dense_idxs))[:limit]
    return self.get_context_chunks(idxs, window=window)

Note: I changed the BM25 call to return indices instead of chunks directly so we can use them for context lookup. Would you like to try this?

Can you please make the get_context_chunks simpler with fixed window of 1?

🤖Reply🤖

Sure! Here’s a simpler version with window=1 fixed:

def get_context_chunks(self, idxs):
    """Get each hit plus its immediate prev/next chunk, staying within the same doc."""
    picked = {}  # index -> chunk; dict keys act as the dedup set
    n = len(self.all_chunks)
    for i in idxs:
        did = self.all_chunks[i].doc_id
        for j in (i - 1, i, i + 1):
            if 0 <= j < n and j not in picked and self.all_chunks[j].doc_id == did:
                picked[j] = self.all_chunks[j]
    return fc.L(sorted(picked.items())).itemgot(1)

Much cleaner! This checks only the immediate prev/current/next chunks for each result.

now lets use the qwen reranker

🤖Reply🤖

Great! For Qwen reranker, you’ll need to:

  1. Install the model (if using locally) or use an API
  2. Get initial candidates from BM25 + dense search (more than limit, e.g., limit * 2)
  3. Pass candidates to reranker with the query
  4. Return top limit reranked results

Are you planning to use: - Local Qwen reranker (via transformers/sentence-transformers)? - API-based (like through Fireworks or another provider)?

Which approach would you prefer?

Yes

🤖Reply🤖

🧠🧠🧠🧠🧠🧠🧠🧠

Found it! * The Fireworks reranking API uses the endpoint https://api.fireworks.ai/inference/v1/rerank with model fireworks/qwen3-reranker-8b.

Here’s how to add the reranker to your HybridSearch class:

import requests

def rerank(self, query, chunks, limit=5):
    """Rerank `chunks` against `query` with the Fireworks Qwen3 reranker.

    Returns the top-`limit` chunks in relevance order. Raises
    `requests.HTTPError` on a non-2xx response instead of failing later with
    a confusing KeyError on the error body.
    """
    docs = [c.text for c in chunks]
    res = requests.post(
        "https://api.fireworks.ai/inference/v1/rerank",
        json={
            "model": "fireworks/qwen3-reranker-8b",
            "query": query,
            "documents": docs,
            "top_n": limit,
            "return_documents": True
        },
        headers={
            "Authorization": f"Bearer {os.environ['FIREWORKS_API_KEY']}",
            "Content-Type": "application/json"
        },
        timeout=60,  # never hang indefinitely on a network call
    )
    res.raise_for_status()
    ranked_idxs = [r['index'] for r in res.json()['results']]
    return fc.L([chunks[i] for i in ranked_idxs])

Then update your search method to use it:

def search(self, query, limit=5):
    """Two-stage retrieval: over-fetch candidates, then let the reranker pick the top `limit`."""
    pool = self._get_candidates(query, limit=limit*3)  # 3x over-fetch for the reranker
    return self.rerank(query, pool, limit=limit)

Would you like to integrate this into your class?

You can also use the openai spec instead of the request library

🤖Reply🤖

🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠🧠

Good point! Here’s the simpler version using the OpenAI client:

def rerank(self, query, chunks, limit=5):
    """Score `chunks` against `query` with the Fireworks Qwen3 reranker and
    return the top `limit` in relevance order."""
    body = {
        "model": "fireworks/qwen3-reranker-8b",
        "query": query,
        "documents": [c.text for c in chunks],
        "top_n": limit
    }
    res = openai_cli.post("/rerank", body=body, cast_to=dict)
    # the API reports original positions; map them back to chunk objects
    return fc.L([chunks[r['index']] for r in res['results']])

Note: If the .post() method doesn’t work directly, you might need to use:

import httpx

def rerank(self, query, chunks, limit=5):
    """Rerank `chunks` against `query` via the Fireworks rerank endpoint (raw HTTP)."""
    payload = {
        "model": "fireworks/qwen3-reranker-8b",
        "query": query,
        "documents": [c.text for c in chunks],
        "top_n": limit
    }
    auth = {"Authorization": f"Bearer {os.environ['FIREWORKS_API_KEY']}"}
    res = httpx.post(f"{openai_cli.base_url}rerank", json=payload, headers=auth).json()
    return fc.L([chunks[r['index']] for r in res['results']])

Would you like to try one of these approaches?