Hello,

I've been working on creating my own evaluation loop based on your code, but I've encountered some difficulties. Could you please review my implementation and let me know if it is correct? If possible, I would also appreciate any suggestions for simplifying or improving the code.
Thank you for your help!
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from lm_eval.api.model import LM
from typing import List, Tuple


class MyCustomLM(LM):
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", max_length: int = 2048):
        """
        Initialize the model, tokenizer, and device settings.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        self.max_length = max_length

    def loglikelihood(self, requests: List[Tuple[str, str]]) -> List[Tuple[float, bool]]:
        """
        Compute the log-likelihood for a list of (context, continuation) pairs.
        """
        results = []
        for context, continuation in requests:
            # Tokenize the inputs
            inputs = self.tokenizer(context, return_tensors="pt", max_length=self.max_length,
                                    truncation=True).to(self.device)
            continuation_inputs = self.tokenizer(continuation, return_tensors="pt",
                                                 add_special_tokens=False).input_ids.to(self.device)
            # Get the model's log probabilities
            with torch.no_grad():
                outputs = self.model(**inputs, labels=inputs.input_ids)
            log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
            # Select the continuation log-probs and compute the log-likelihood
            cont_tokens = continuation_inputs[0]
            cont_log_probs = log_probs[0, -len(cont_tokens):-1]
            loglikelihood = cont_log_probs.gather(1, cont_tokens.unsqueeze(-1)).sum().item()
            # Check if the prediction matches the continuation
            prediction = log_probs.argmax(dim=-1)[0, -len(cont_tokens):]
            is_exact_match = torch.equal(prediction, cont_tokens)
            results.append((loglikelihood, is_exact_match))
        return results

    def loglikelihood_rolling(self, requests: List[str]) -> List[Tuple[float, bool]]:
        """
        Compute the rolling log-likelihood for a list of contexts.
        """
        results = []
        for context in requests:
            encoded = self.tokenizer(context, return_tensors="pt", max_length=self.max_length,
                                     truncation=True).to(self.device)
            total_loglikelihood = 0
            for i in range(1, len(encoded.input_ids[0])):
                input_ids = encoded.input_ids[:, :i]
                labels = encoded.input_ids[:, 1:i + 1]
                with torch.no_grad():
                    outputs = self.model(input_ids, labels=labels)
                log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
                total_loglikelihood += log_probs.gather(2, labels.unsqueeze(-1)).sum().item()
            results.append((total_loglikelihood, True))  # Assuming rolling loglikelihood always succeeds in this case
        return results

    def generate_until(self, requests: List[Tuple[str, dict]]) -> List[str]:
        """
        Generates text until a stopping criterion is reached.
        """
        results = []
        for context, gen_kwargs in requests:
            inputs = self.tokenizer(context, return_tensors="pt", max_length=self.max_length,
                                    truncation=True).to(self.device)
            gen_outputs = self.model.generate(inputs.input_ids,
                                              max_length=gen_kwargs.get("max_length", self.max_length),
                                              **gen_kwargs)
            # Decode the generated tokens to text
            generated_text = self.tokenizer.decode(gen_outputs[0], skip_special_tokens=True)
            results.append(generated_text)
        return results

    def evaluate_on_mmlu(self, subject: str = 'abstract_algebra', split: str = 'validation'):
        """
        Evaluate the model on the MMLU (Massive Multitask Language Understanding) dataset for the given subject.
        """
        # Load the MMLU validation dataset
        dataset = load_dataset("cais/mmlu", subject, split=split)
        correct_predictions = 0
        total_predictions = 0
        for example in dataset:
            question = example['question']
            choices = example['choices']
            correct_answer_idx = example['answer']
            # Construct the instruction prompt
            instruction = f"""
            You are an expert in {subject}. Here is a multiple-choice question for you to answer.
            Question: {question}
            Choices: {', '.join([f"({chr(65 + i)}) {choice}" for i, choice in enumerate(choices)])}
            Please pick the correct choice (A, B, C, or D).
            """
            inputs = self.tokenizer(instruction, return_tensors="pt", max_length=self.max_length,
                                    truncation=True).to(self.device)
            # Generate the response
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_length=self.max_length)
            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Check if the generated response contains the correct letter (A, B, C, or D)
            generated_answer = None
            for letter in ['A', 'B', 'C', 'D']:
                if letter in generated_text:
                    generated_answer = letter
                    break
            # Map the generated answer to the correct index
            if generated_answer:
                generated_idx = ord(generated_answer) - ord('A')
            else:
                generated_idx = -1  # If no valid answer is generated
            # Evaluate if the generated answer matches the correct one
            if generated_idx == correct_answer_idx:
                correct_predictions += 1
            total_predictions += 1
        # Calculate accuracy
        accuracy = correct_predictions / total_predictions * 100
        print(f"Accuracy on MMLU '{subject}': {accuracy:.2f}%")
        return accuracy


# Example usage
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = MyCustomLM(model_name=model_name)

# Evaluate on the abstract algebra subset of the MMLU dataset
model.evaluate_on_mmlu(subject='abstract_algebra')
```
Hi! The `loglikelihood` implementation is not quite right. We need to calculate the conditional log-likelihood of the continuations conditioned on the context. So something like:

```python
logits = model(context + continuation[:-1])  # [:-1] because for each input token, the model predicts the next token in the sequence
continuation_logits = logits[context_len:]
```
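In case it helps, here's a minimal sketch of that idea for a single (context, continuation) pair, assuming a Hugging Face causal LM and tokenizer as in your snippet. The helper name `score_continuation` is just for illustration, not part of the harness:

```python
import torch
import torch.nn.functional as F

def score_continuation(model, tokenizer, context: str, continuation: str, device):
    # Hypothetical helper: scores one (context, continuation) pair.
    ctx_ids = tokenizer(context, return_tensors="pt").input_ids.to(device)
    cont_ids = tokenizer(continuation, return_tensors="pt",
                         add_special_tokens=False).input_ids.to(device)
    input_ids = torch.cat([ctx_ids, cont_ids], dim=1)

    # One forward pass over context + continuation
    with torch.no_grad():
        logits = model(input_ids).logits  # [1, seq_len, vocab]

    # Note the -1 shift: the logit at position i predicts token i + 1,
    # so the slice that predicts the continuation starts at ctx_len - 1.
    ctx_len = ctx_ids.shape[1]
    cont_len = cont_ids.shape[1]
    cont_logits = logits[0, ctx_len - 1 : ctx_len - 1 + cont_len]  # [cont_len, vocab]

    # Sum the log-probs of the actual continuation tokens
    log_probs = F.log_softmax(cont_logits, dim=-1)
    target = cont_ids[0]  # [cont_len]
    loglikelihood = log_probs.gather(1, target.unsqueeze(-1)).sum().item()

    # is_greedy: would greedy decoding have produced exactly this continuation?
    is_greedy = bool((cont_logits.argmax(dim=-1) == target).all())
    return loglikelihood, is_greedy
```

The key points are running the model once over the concatenated tokens and applying the one-position shift (the logit at position i predicts token i + 1), which also makes the greedy-match check line up with the right slice.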