Clarification Needed on Interface Implementation #2415

Open
sorobedio opened this issue Oct 21, 2024 · 1 comment
Labels
asking questions: For asking for clarification / support on library usage.

Comments

@sorobedio

Hello,

I've been working on creating my own evaluation loop based on your code, but I've encountered some difficulties. Could you please review my implementation and let me know if it is correct? If possible, I would also appreciate any suggestions for simplifying or improving the code.

Thank you for your help!

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from lm_eval.api.model import LM
from typing import List, Tuple


class MyCustomLM(LM):
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", max_length: int = 2048):
        """
        Initialize the model, tokenizer, and device settings.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        self.max_length = max_length
    def loglikelihood(self, requests: List[Tuple[str, str]]) -> List[Tuple[float, bool]]:
        """
        Compute the log-likelihood for a list of (context, continuation) pairs.
        """
        results = []

        for context, continuation in requests:
            # Tokenize the inputs
            inputs = self.tokenizer(context, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)
            continuation_inputs = self.tokenizer(continuation, return_tensors="pt",
                                                 add_special_tokens=False).input_ids.to(self.device)

            # Get the model's log probabilities
            with torch.no_grad():
                outputs = self.model(**inputs, labels=inputs.input_ids)
                log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)

            # Select the continuation log-probs and compute the log-likelihood
            cont_tokens = continuation_inputs[0]
            cont_log_probs = log_probs[0, -len(cont_tokens):-1]
            loglikelihood = cont_log_probs.gather(1, cont_tokens.unsqueeze(-1)).sum().item()

            # Check if the prediction matches the continuation
            prediction = log_probs.argmax(dim=-1)[0, -len(cont_tokens):]
            is_exact_match = torch.equal(prediction, cont_tokens)

            results.append((loglikelihood, is_exact_match))

        return results

    def loglikelihood_rolling(self, requests: List[str]) -> List[Tuple[float, bool]]:
        """
        Compute the rolling log-likelihood for a list of contexts.
        """
        results = []
        for context in requests:
            encoded = self.tokenizer(context, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)

            total_loglikelihood = 0
            for i in range(1, len(encoded.input_ids[0])):
                input_ids = encoded.input_ids[:, :i]
                labels = encoded.input_ids[:, 1:i + 1]

                with torch.no_grad():
                    outputs = self.model(input_ids, labels=labels)
                    log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
                    total_loglikelihood += log_probs.gather(2, labels.unsqueeze(-1)).sum().item()

            results.append((total_loglikelihood, True))  # Assuming rolling loglikelihood always succeeds in this case

        return results

    def generate_until(self, requests: List[Tuple[str, dict]]) -> List[str]:
        """
        Generates text until a stopping criterion is reached.
        """
        results = []
        for context, gen_kwargs in requests:
            inputs = self.tokenizer(context, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)
            gen_outputs = self.model.generate(inputs.input_ids,
                                              max_length=gen_kwargs.get("max_length", self.max_length), **gen_kwargs)

            # Decode the generated tokens to text
            generated_text = self.tokenizer.decode(gen_outputs[0], skip_special_tokens=True)
            results.append(generated_text)

        return results

    def evaluate_on_mmlu(self, subject: str = 'abstract_algebra', split: str = 'validation'):
        """
        Evaluate the model on the MMLU (Massive Multitask Language Understanding) dataset for the given subject.
        """
        # Load the MMLU validation dataset
        dataset = load_dataset("cais/mmlu", subject, split=split)

        correct_predictions = 0
        total_predictions = 0

        for example in dataset:
            question = example['question']
            choices = example['choices']
            correct_answer_idx = example['answer']

            # Construct the instruction prompt
            instruction = f"""
            You are an expert in {subject}. Here is a multiple-choice question for you to answer.
            Question: {question}
            Choices: {', '.join([f"({chr(65 + i)}) {choice}" for i, choice in enumerate(choices)])}
            Please pick the correct choice (A, B, C, or D).
            """

            inputs = self.tokenizer(instruction, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)

            # Generate the response
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_length=self.max_length)
                generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Check if the generated response contains only the correct letter (A, B, C, or D)
            generated_answer = None
            for letter in ['A', 'B', 'C', 'D']:
                if letter in generated_text:
                    generated_answer = letter
                    break

            # Map the generated answer to the correct index
            if generated_answer:
                generated_idx = ord(generated_answer) - ord('A')
            else:
                generated_idx = -1  # If no valid answer is generated

            # Evaluate if the generated answer matches the correct one
            if generated_idx == correct_answer_idx:
                correct_predictions += 1
            total_predictions += 1

        # Calculate accuracy
        accuracy = correct_predictions / total_predictions * 100
        print(f"Accuracy on MMLU '{subject}': {accuracy:.2f}%")

        return accuracy

# Example usage
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = MyCustomLM(model_name=model_name)

# Evaluate on the abstract algebra subset of the MMLU dataset
model.evaluate_on_mmlu(subject='abstract_algebra')
```

@baberabb
Contributor

Hi! The loglikelihood implementation is not quite right. We need to calculate the conditional log-likelihood of the continuation conditioned on the context. So something like:

```python
logits = model(context + continuation[:-1])  # [:-1] because for each input token, the model predicts the next token in the sequence
continuation_logits = logits[context_len:]
```

There are some details in the docstrings.
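
For concreteness, here is a minimal self-contained sketch of that conditional scoring for a single (context, continuation) pair, assuming a Hugging Face causal LM and tokenizer. The helper name `score_continuation` and its exact indexing are illustrative only, not the harness's implementation:

```python
import torch
import torch.nn.functional as F


def score_continuation(model, tokenizer, context, continuation, device="cpu"):
    # Tokenize context and continuation separately so the continuation boundary is known.
    # Assumes a non-empty context.
    context_ids = tokenizer(context, add_special_tokens=False).input_ids
    continuation_ids = tokenizer(continuation, add_special_tokens=False).input_ids

    # Feed context + continuation[:-1]: the position at index len(context_ids) - 1
    # predicts the first continuation token, the next position predicts the second, etc.
    input_ids = torch.tensor([context_ids + continuation_ids[:-1]], device=device)
    with torch.no_grad():
        logits = model(input_ids).logits  # shape [1, seq_len, vocab_size]

    # Keep only the positions that predict continuation tokens.
    cont_logits = logits[0, len(context_ids) - 1:, :]
    log_probs = F.log_softmax(cont_logits, dim=-1)

    targets = torch.tensor(continuation_ids, device=device)
    token_log_probs = log_probs.gather(1, targets.unsqueeze(-1)).squeeze(-1)

    # Greedy flag: True if the continuation is the argmax prediction at every position.
    is_greedy = bool((log_probs.argmax(dim=-1) == targets).all())

    return token_log_probs.sum().item(), is_greedy
```

This differs from the snippet in the original post, where only the context is passed through the model, so the logits used for scoring never cover the continuation tokens.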

@baberabb added the asking questions label Oct 23, 2024