Clarification Needed on Interface Implementation #2415

Open
sorobedio opened this issue Oct 21, 2024 · 1 comment
Labels
asking questions: For asking for clarification / support on library usage.

Comments

@sorobedio

Hello,

I've been working on creating my own evaluation loop based on your code, but I've encountered some difficulties. Could you please review my implementation and let me know if it is correct? If possible, I would also appreciate any suggestions for simplifying or improving the code.

Thank you for your help!

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
from lm_eval.api.model import LM
from typing import List, Tuple


class MyCustomLM(LM):
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-1B-Instruct", max_length: int = 2048):
        """
        Initialize the model, tokenizer, and device settings.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        self.max_length = max_length
    def loglikelihood(self, requests: List[Tuple[str, str]]) -> List[Tuple[float, bool]]:
        """
        Compute the log-likelihood for a list of (context, continuation) pairs.
        """
        results = []

        for context, continuation in requests:
            # Tokenize the inputs
            inputs = self.tokenizer(context, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)
            continuation_inputs = self.tokenizer(continuation, return_tensors="pt",
                                                 add_special_tokens=False).input_ids.to(self.device)

            # Get the model's log probabilities
            with torch.no_grad():
                outputs = self.model(**inputs, labels=inputs.input_ids)
                log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)

            # Select the continuation log-probs and compute the log-likelihood
            cont_tokens = continuation_inputs[0]
            cont_log_probs = log_probs[0, -len(cont_tokens):-1]
            loglikelihood = cont_log_probs.gather(1, cont_tokens.unsqueeze(-1)).sum().item()

            # Check if the prediction matches the continuation
            prediction = log_probs.argmax(dim=-1)[0, -len(cont_tokens):]
            is_exact_match = torch.equal(prediction, cont_tokens)

            results.append((loglikelihood, is_exact_match))

        return results

    def loglikelihood_rolling(self, requests: List[str]) -> List[Tuple[float, bool]]:
        """
        Compute the rolling log-likelihood for a list of contexts.
        """
        results = []
        for context in requests:
            encoded = self.tokenizer(context, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)

            total_loglikelihood = 0
            for i in range(1, len(encoded.input_ids[0])):
                input_ids = encoded.input_ids[:, :i]
                labels = encoded.input_ids[:, 1:i + 1]

                with torch.no_grad():
                    outputs = self.model(input_ids, labels=labels)
                    log_probs = torch.nn.functional.log_softmax(outputs.logits, dim=-1)
                    total_loglikelihood += log_probs.gather(2, labels.unsqueeze(-1)).sum().item()

            results.append((total_loglikelihood, True))  # Assuming rolling loglikelihood always succeeds in this case

        return results

    def generate_until(self, requests: List[Tuple[str, dict]]) -> List[str]:
        """
        Generates text until a stopping criterion is reached.
        """
        results = []
        for context, gen_kwargs in requests:
            inputs = self.tokenizer(context, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)
            gen_outputs = self.model.generate(inputs.input_ids,
                                              max_length=gen_kwargs.get("max_length", self.max_length), **gen_kwargs)

            # Decode the generated tokens to text
            generated_text = self.tokenizer.decode(gen_outputs[0], skip_special_tokens=True)
            results.append(generated_text)

        return results

    def evaluate_on_mmlu(self, subject: str = 'abstract_algebra', split: str = 'validation'):
        """
        Evaluate the model on the MMLU (Massive Multitask Language Understanding) dataset for the given subject.
        """
        # Load the MMLU validation dataset
        dataset = load_dataset("cais/mmlu", subject, split=split)

        correct_predictions = 0
        total_predictions = 0

        for example in dataset:
            question = example['question']
            choices = example['choices']
            correct_answer_idx = example['answer']

            # Construct the instruction prompt
            instruction = f"""
            You are an expert in {subject}. Here is a multiple-choice question for you to answer.
            Question: {question}
            Choices: {', '.join([f"({chr(65 + i)}) {choice}" for i, choice in enumerate(choices)])}
            Please pick the correct choice (A, B, C, or D).
            """

            inputs = self.tokenizer(instruction, return_tensors="pt", max_length=self.max_length, truncation=True).to(
                self.device)

            # Generate the response
            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_length=self.max_length)
                generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Check if the generated response contains only the correct letter (A, B, C, or D)
            generated_answer = None
            for letter in ['A', 'B', 'C', 'D']:
                if letter in generated_text:
                    generated_answer = letter
                    break

            # Map the generated answer to the correct index
            if generated_answer:
                generated_idx = ord(generated_answer) - ord('A')
            else:
                generated_idx = -1  # If no valid answer is generated

            # Evaluate if the generated answer matches the correct one
            if generated_idx == correct_answer_idx:
                correct_predictions += 1
            total_predictions += 1

        # Calculate accuracy
        accuracy = correct_predictions / total_predictions * 100
        print(f"Accuracy on MMLU '{subject}': {accuracy:.2f}%")

        return accuracy

# Example usage
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = MyCustomLM(model_name=model_name)

# Evaluate on the abstract algebra subset of the MMLU dataset
model.evaluate_on_mmlu(subject='abstract_algebra')
```

@baberabb
Contributor

Hi! The loglikelihood implementation is not quite right. We need to calculate the conditional log-likelihood of the continuation conditioned on the context. So something like:

```python
logits = model(context + continuation[:-1])  # [:-1] because for each input token, the model predicts the next token in the sequence
continuation_logits = logits[context_len:]
```

There are some details in the docstrings.
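
For concreteness, here is a minimal self-contained sketch of that conditional scoring for a single (context, continuation) pair, assuming a Hugging Face causal LM and tokenizer. The helper name `score_continuation` and its exact indexing are illustrative only, not the harness's implementation:

```python
import torch
import torch.nn.functional as F


def score_continuation(model, tokenizer, context, continuation, device="cpu"):
    # Tokenize context and continuation separately so the continuation boundary is known.
    # Assumes a non-empty context.
    context_ids = tokenizer(context, add_special_tokens=False).input_ids
    continuation_ids = tokenizer(continuation, add_special_tokens=False).input_ids

    # Feed context + continuation[:-1]: the position at index len(context_ids) - 1
    # predicts the first continuation token, the next position predicts the second, etc.
    input_ids = torch.tensor([context_ids + continuation_ids[:-1]], device=device)
    with torch.no_grad():
        logits = model(input_ids).logits  # shape [1, seq_len, vocab_size]

    # Keep only the positions that predict continuation tokens.
    cont_logits = logits[0, len(context_ids) - 1:, :]
    log_probs = F.log_softmax(cont_logits, dim=-1)

    targets = torch.tensor(continuation_ids, device=device)
    token_log_probs = log_probs.gather(1, targets.unsqueeze(-1)).squeeze(-1)

    # Greedy flag: True if the continuation is the argmax prediction at every position.
    is_greedy = bool((log_probs.argmax(dim=-1) == targets).all())

    return token_log_probs.sum().item(), is_greedy
```

This differs from the snippet in the original post, where only the context is passed through the model, so the logits used for scoring never cover the continuation tokens.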

@baberabb added the asking questions label Oct 23, 2024