diff --git a/google-gemma-Gemma3-4B/qnn-cpu/copy_embed_quant_param.py b/google-gemma-Gemma3-4B/qnn-cpu/copy_embed_quant_param.py new file mode 100644 index 00000000..c748cfff --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/copy_embed_quant_param.py @@ -0,0 +1,52 @@ +import onnx +from onnx import OperatorSetIdProto + +QUANT_MODEL = r"models/gemma3_qnn/model/embeddings.onnx" +IMAGE_MODEL = r"models/gemma-3-4b-it-embed/model/model.onnx" +# TARGET_NODE = r"/embedding_layer/Gather" +TARGET_NODE = r"node_embedding" +SOURCE_NODE = r"/model/embed_tokens/Gather_Q4" + +OUTPUT_MODEL = r"models/gemma3_qnn/model/embed_quant.onnx" + +def find_node_by_name(m, node_name): + for idx, node in enumerate(m.graph.node): + if node.name == node_name: + return (idx, node) + raise ValueError(f"node {node_name} not found in graph") + +def find_initializer_by_name(m, init_name): + for idx, init in enumerate(m.graph.initializer): + if init.name == init_name: + return (idx, init) + raise ValueError(f"initializer {init_name} not found in graph") + +im = onnx.load(IMAGE_MODEL) +qm = onnx.load(QUANT_MODEL) + +s_idx, s_node = find_node_by_name(qm, SOURCE_NODE) +i_idx, i_node = find_node_by_name(im, TARGET_NODE) + +assert(s_node.op_type == 'GatherBlockQuantized') +assert(i_node.op_type == 'Gather') + +w_idx, w_init = find_initializer_by_name(im, i_node.input[0]) # Gather weight +wq_idx, wq_init = find_initializer_by_name(qm, s_node.input[0]) # Gather Q4 weight +ws_idx, ws_init = find_initializer_by_name(qm, s_node.input[2]) # Gather Q4 scales + +# Modify graph +new_import = OperatorSetIdProto() +new_import.domain = s_node.domain +new_import.version = 1 +im.opset_import.extend([new_import]) + +gather_out = i_node.output[0] +del im.graph.node[i_idx] +im.graph.node.insert(i_idx, qm.graph.node[s_idx]) +im.graph.node[i_idx].output[0] = gather_out + +del im.graph.initializer[w_idx] +im.graph.initializer.append(wq_init) +im.graph.initializer.append(ws_init) + +onnx.save(im, OUTPUT_MODEL, save_as_external_data = True, all_tensors_to_one_file = True, location = OUTPUT_MODEL.split('/')[-1] + '.data') diff --git a/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_datasets.py b/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_datasets.py new file mode 100644 index 00000000..77751530 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_datasets.py @@ -0,0 +1,526 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
+# -------------------------------------------------------------------------- + +import copy +import logging +import os +import subprocess +import zipfile +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional + +import torch +from datasets import load_dataset +from huggingface_hub import hf_hub_download +from PIL import Image as PILImage +from transformers import ( + AutoModel, + AutoProcessor, + AutoTokenizer, +) + +from olive.data.registry import Registry + +logger = logging.getLogger(__name__) + + +class BaseGemmaDataset(ABC): + """Abstract base class for Gemma dataset implementations.""" + + CACHE_DIR = os.getenv("CACHE_DIR", ".cache") + + def __init__(self, model_id: str, first_n: Optional[int] = None): + self.model_id = model_id + self.first_n = first_n + self.processor = AutoProcessor.from_pretrained(self.model_id) + + # Initialize attributes that will be set during dataset loading + self.image_data_path = None + self.raw_datasets = None + + # Initialize processor components based on subclass requirements + self._initialize_processor_components() + + self.setup_dataset() + + @abstractmethod + def _initialize_processor_components(self): + """Initialize processor components specific to the dataset type.""" + + @abstractmethod + def _process_dataset_entry(self, entry: dict[str, any]): + """Process a single dataset entry according to the dataset type.""" + + def _convert_single_llava_to_gemma_conversation( + self, conversation: list[dict[str, str]], strip_images: bool = False + ) -> dict[str, str | list[dict]]: + """Convert a single llava-style conversation entry to Gemma-style. + + Args: + conversation: The conversation entry to convert + strip_images: If True, remove tokens and create text-only content. + If False, preserve tokens and create multimodal content. 
+ + Examples: + >>> conversation = {"from": "human", "value": "<image>\nWhat are the colors of the bus in the image?"} + >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=False) + { + 'role': 'user', + 'content': [{'type': 'image'}, {'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + >>> _convert_single_llava_to_gemma_conversation(conversation, strip_images=True) + { + 'role': 'user', + 'content': [{'type': 'text', 'text': 'What are the colors of the bus in the image?'}] + } + + """ + who = conversation.get("from") + match who: + case "human": + role = "user" + case "gpt": + role = "assistant" + case _: + raise ValueError(f"Unknown role: {who}") + + text = conversation.get("value") + + if strip_images: + # Text-only: remove image references completely + text = text.replace("<image>", "").strip() + return { + "role": role, + "content": [{"type": "text", "text": text}], + } + else: + # Multimodal: preserve image references + if "<image>" in text: + has_image = True + text = text.replace("<image>", "") + else: + has_image = False + + return { + "role": role, + "content": ( + [{"type": "image"}, {"type": "text", "text": text}] + if has_image + else [{"type": "text", "text": text}] + ), + } + + def _convert_llava_to_gemma_conversation(self, entry: dict[str, any], strip_images: bool = False): + """Convert LLaVA-style conversations to Gemma-style.""" + entry["text"] = [ + self._convert_single_llava_to_gemma_conversation(conversation, strip_images=strip_images) + for conversation in entry["conversations"] + ] + del entry["conversations"] + return entry + + def _download_and_extract_images(self): + """Download the COCO train2017 image dataset and extract to the cache directory.""" + zip_filename = "train2017.zip" + zip_path = os.path.join(self.CACHE_DIR, zip_filename) + extract_path = os.path.join(self.CACHE_DIR, "train2017") + + # Create cache directory if it doesn't exist + os.makedirs(self.CACHE_DIR, exist_ok=True) + + # Check if images are already downloaded and extracted + extract_path_obj = Path(extract_path) + if extract_path_obj.exists() and any(extract_path_obj.iterdir()): + logger.info("Images already exist at %s", extract_path) + return extract_path + + # Download the dataset if zip doesn't exist + if not os.path.exists(zip_path): + logger.info("Downloading COCO train2017 dataset to %s", zip_path) + try: + subprocess.run( + [ + "wget", + "https://images.cocodataset.org/zips/train2017.zip", + "--no-check-certificate", + "-O", + zip_path, + ], + check=True, + ) + logger.info("Download completed successfully") + except subprocess.CalledProcessError: + logger.exception("Failed to download dataset") + raise + except FileNotFoundError: + logger.exception("wget command not found.
Please install wget or use an alternative download method.") + raise + + # Extract the zip file + logger.info("Extracting %s to %s", zip_path, self.CACHE_DIR) + try: + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(self.CACHE_DIR) + logger.info("Extraction completed successfully") + except zipfile.BadZipFile: + logger.exception("Failed to extract zip file") + # Remove corrupted zip file so it can be re-downloaded + if os.path.exists(zip_path): + os.remove(zip_path) + raise + + return extract_path + + def _load_base_dataset(self): + """Load the base LlaVA dataset.""" + # Issue with Arrow leads to errors when using load_dataset directly on liuhaotian/LLaVA-Instruct-150K + file_path = hf_hub_download( + repo_id="liuhaotian/LLaVA-Instruct-150K", + filename="llava_instruct_80k.json", + repo_type="dataset", + cache_dir=self.CACHE_DIR, + ) + + self.image_data_path = self._download_and_extract_images() + self.raw_datasets = load_dataset("json", data_files=[file_path], split="train") + + # Limit data processing to the first_n rows + self.raw_datasets = self.raw_datasets if self.first_n is None else self.raw_datasets.select(range(self.first_n)) + + def _extract_image_details(self, entry: dict[str, any]): + """Extract image details from the dataset example. + + Opens the image file and adds image mode information to the example. + """ + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"])) + entry["image_mode"] = image.mode + return entry + + def setup_dataset(self): + """Set up the dataset with common preprocessing steps.""" + self._load_base_dataset() + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply dataset-specific processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def get_dataset(self): + """Return the processed dataset.""" + return self.raw_datasets + + +class GemmaMultimodalDataset(BaseGemmaDataset): + """Dataset for full E2E Gemma 3 multi-modal model including both image and text.""" + + def _initialize_processor_components(self): + """Initialize tokenizer for multimodal processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Load image and tokenize the conversation for model input. 
+ + Args: + entry: Dataset entry containing text conversation and image path + + Returns: + Tokenized inputs ready for model processing + + """ + return self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + +class GemmaTextOnlyDataset(BaseGemmaDataset): + """Dataset for only the text portion of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """Initialize tokenizer for text-only processing.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def setup_dataset(self): + """Set up the text-only dataset with conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (strip images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=True) + ) + + # Extract image details (still needed for filtering) + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply text-only processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Extract and tokenize only the text content. + + Args: + entry: Dataset entry containing text conversation + + Returns: + Tokenized text inputs ready for model processing + + """ + # Apply chat template without images, text-only + inputs = self.tokenizer.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + return {k: v.squeeze(0) for k, v in inputs.items()} # Remove batch dimension + + +class GemmaImageDataset(BaseGemmaDataset): + """Dataset for only the image processing of the Gemma 3 model.""" + + def _initialize_processor_components(self): + """No additional components needed for image-only processing.""" + + def _process_dataset_entry(self, entry: dict[str, any]): + """Load image and extract only pixel_values for image-only processing.""" + # Load and process the image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + + # Process image to get pixel_values + inputs = self.processor(text="", images=image, return_tensors="pt") + + # Return only pixel_values + return {"pixel_values": inputs["pixel_values"]} + + +class GemmaEmbeddingInputDataset(BaseGemmaDataset): + """Dataset that is the input to the embedding layer.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_vision_components(self): + """Lazy-load vision model components when first needed.""" + if self._vision_tower is None: + logger.info("Loading vision model components for cached embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract vision components (equivalent to Gemma3VisualEmbeddingGenerator) + self._vision_tower = full_model.vision_tower + self._multi_modal_projector = 
full_model.multi_modal_projector + + # Clean up full model to save memory + del full_model.language_model + + return self._vision_tower.cuda(), self._multi_modal_projector.cuda() + + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return input_ids and cached image features.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Get vision components and extract features + vision_tower, projector = self._get_vision_components() + pixel_values = pixel_values.to(device="cuda") + + with torch.no_grad(): + # Process through vision tower + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + image_features = projector(selected_image_feature) + # Convert to numpy for caching + image_features = image_features.cpu().detach().numpy() + + return {"input_ids": inputs["input_ids"], "image_features": image_features} + + +class GemmaEmbeddingDataset(BaseGemmaDataset): + """Dataset that pre-merges text and image embeddings.""" + + def __init__(self, model_id, first_n=None): + # Initialize lazy-loaded model components + self._vision_tower = None + self._multi_modal_projector = None + self._embedding_layer = None + + super().__init__(model_id, first_n) + + def _initialize_processor_components(self): + """Initialize only standard processor components.""" + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, cache_dir=self.CACHE_DIR, use_fast=True, trust_remote_code=True + ) + + def _get_model_components(self): + """Lazy-load all required model components when first needed.""" + if self._embedding_layer is None: + logger.info("Loading model components for merged embedding dataset") + full_model = AutoModel.from_pretrained(self.model_id) + + # Extract components + self._vision_tower = full_model.vision_tower.cuda() + self._multi_modal_projector = full_model.multi_modal_projector.cuda() + self._embedding_layer = copy.deepcopy(full_model.language_model.embed_tokens).cuda() + + # Clean up full model + del full_model.language_model + + return self._vision_tower, self._multi_modal_projector, self._embedding_layer + + def _merge_embeddings(self, input_ids: torch.Tensor, pixel_values: torch.Tensor): + """Merge text and image embeddings at special token positions.""" + vision_tower, projector, embedding_layer = self._get_model_components() + + # Get text embeddings + inputs_embeds = embedding_layer(input_ids.to(device="cuda")) + + # Process 
image + pixel_values = pixel_values.to(dtype=inputs_embeds.dtype, device="cuda") + with torch.no_grad(): + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + image_features = projector(selected_image_feature) + + # Merge at special token positions (image_token_index = 262144) + image_token_index = 262144 + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + return inputs_embeds.masked_scatter(special_image_mask, image_features) + + def setup_dataset(self): + """Set up the multimodal dataset with text conversation conversion.""" + self._load_base_dataset() + + # Convert the Llava-style conversation to Gemma-style conversation (preserve images) + self.raw_datasets = self.raw_datasets.map( + lambda entry: self._convert_llava_to_gemma_conversation(entry, strip_images=False) + ) + + # Extract image details + self.raw_datasets = self.raw_datasets.map(self._extract_image_details) + + # Filter out any images that are not RGB + self.raw_datasets = self.raw_datasets.filter(lambda x: x["image_mode"] == "RGB") + + # Apply multimodal processing + self.raw_datasets = self.raw_datasets.with_transform(self._process_dataset_entry) + + def _process_dataset_entry(self, entry: dict[str, any]): + """Process entry to return merged embeddings.""" + # Convert conversation and tokenize + inputs = self.processor.apply_chat_template( + entry["text"][0], add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True + ) + + # Load and process image + image = PILImage.open(fp=os.path.join(self.image_data_path, entry["image"][0])) + pixel_values = torch.tensor(self.processor(text="", images=image).pixel_values) + + # Merge embeddings + inputs_embeds = self._merge_embeddings(inputs["input_ids"], pixel_values) + + return { + "input_ids": inputs["input_ids"], + "inputs_embeds": inputs_embeds, + "attention_mask": inputs["attention_mask"].squeeze(0), + } + + +# Remove this when submitting for review +TEXT_SHORTCUT_FIRST_N = 600 +SHORTCUT_FIRST_N = 200 + + +@Registry.register_dataset() +def gemma_dataset(model_id: str): + """Full E2E Gemma 3 multi-modal dataset (image + text).""" + return GemmaMultimodalDataset(model_id, first_n=TEXT_SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_text_dataset(model_id: str): + """Text-only Gemma 3 dataset.""" + return GemmaTextOnlyDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_image_dataset(model_id: str): + """Image-only Gemma 3 dataset.""" + return GemmaImageDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_embedding_input_dataset(model_id: str): + """Gemma 3 dataset with embedding layer input.""" + return GemmaEmbeddingInputDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() + + +@Registry.register_dataset() +def gemma_embedding_dataset(model_id: str): + """Gemma 3 dataset with pre-merged text and image embeddings.""" + return GemmaEmbeddingDataset(model_id, first_n=SHORTCUT_FIRST_N).get_dataset() diff --git a/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_embedding.py b/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_embedding.py new file mode 100644 index 00000000..97c9cf2e --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_embedding.py @@ 
-0,0 +1,37 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + + +import logging + +import torch +from transformers import AutoModel + +logger = logging.getLogger(__name__) + + +class EmbeddingLayer(torch.nn.Module): + def __init__(self, full_model): + super().__init__() + self.embedding_layer = full_model.language_model.embed_tokens + + def forward(self, input_ids, image_features): + image_token_index = 262144 + inputs_embeds = self.embedding_layer(input_ids) + + special_image_mask = (input_ids == image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + return inputs_embeds.masked_scatter(special_image_mask, image_features) + + +def load_gemma3_embedding_model(model_path): + full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") + logger.info("Loaded full model: %s", full_model) + + embedding_layer = EmbeddingLayer(full_model) + + logger.info("Created embedding-only model: %s", embedding_layer) + return embedding_layer diff --git a/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_vision.py b/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_vision.py new file mode 100644 index 00000000..1eb7f8f3 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/custom_gemma3_4b_vision.py @@ -0,0 +1,36 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + + +import logging + +import torch +from transformers import AutoModel + +logger = logging.getLogger(__name__) + + +class Gemma3VisualEmbeddingGenerator(torch.nn.Module): + def __init__(self, full_model): + super().__init__() + # Extract only the vision components + self.vision_tower = full_model.vision_tower + self.multi_modal_projector = full_model.multi_modal_projector + + def forward(self, pixel_values): + # Process images through vision tower + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.last_hidden_state + # Project to final embedding space + return self.multi_modal_projector(selected_image_feature) + + +def load_gemma3_vision_model(model_path): + full_model = AutoModel.from_pretrained("google/gemma-3-4b-it") + logger.info("Loaded full model: %s", full_model) + + vision_model = Gemma3VisualEmbeddingGenerator(full_model) + logger.info("Created vision-only model: %s", vision_model) + return vision_model diff --git a/google-gemma-Gemma3-4B/qnn-cpu/env_setup.sh b/google-gemma-Gemma3-4B/qnn-cpu/env_setup.sh new file mode 100644 index 00000000..aa117afc --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/env_setup.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +# Installing setuptools to build Olive from source +uv pip install setuptools + +# Requires installation of uv +uv pip install -r ../requirements.txt + +# Require installation of Olive dependencies +uv pip install -r ../../../requirements.txt + +# Disable CUDA extension build +export BUILD_CUDA_EXT=0 + +# Install AutoGPTQ from source +uv pip install --no-build-isolation git+https://github.com/PanQiWei/AutoGPTQ.git + +# Install GptqModel from source +# Note: Commit hash corresponds to commit which fixes Gemma 3 memory leak issue. See README.md for additional details. +uv pip install --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d + +# Install onnxruntime-qnn without installing onnxruntime +uv pip install -r https://raw.githubusercontent.com/microsoft/onnxruntime/refs/heads/main/requirements.txt +uv pip install -U --pre --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple onnxruntime-qnn --no-deps diff --git a/google-gemma-Gemma3-4B/qnn-cpu/gemma-3-4b.ipynb b/google-gemma-Gemma3-4B/qnn-cpu/gemma-3-4b.ipynb new file mode 100644 index 00000000..50b17bb8 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/gemma-3-4b.ipynb @@ -0,0 +1,379 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gemma 3 4B QNN model conversion with Olive \n", + "### Task: Text + Vision Generation 📝\n", + "\n", + "In this notebook, you'll:\n", + "- Download the required datasets\n", + "- Quantize language model\n", + "- Convert Vision to QNN format\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Platform requirements\n", + "This notebook is intended to run on a machine with:\n", + " * **Operating System**: Linux Ubuntu 22.04 (automated setup script is Linux-only)\n", + " * **Python**: 3.10\n", + " * NVIDIA driver version equivalent to 525.60.13\n", + " * NVIDIA A100 GPU\n", + " * **Storage**: ~13GB for COCO train2017 dataset (downloaded automatically)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "🐍 Python Virtual environments\n", + "Creates Olive and QNN python virtual environments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!git clone https://github.com/CodeLinaro/Olive.git -b dev/qti-tbhardwa/gemma3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import venv\n", + "from pathlib import Path\n", + "import subprocess\n", + "import json\n", + "import shutil\n", + "import urllib.request\n", + "import onnx\n", + "from onnx import helper, TensorProto\n", + "import glob\n", + "\n", + "current_dir = os.getcwd()\n", + "MODEL=\"google/gemma-3-4b-it\"\n", + "OLIVE_PYTHON_PATH = './olive_venv'\n", + "OLIVE_PYTHON_BIN = './olive_venv/bin/python'\n", + "olive_pip_path = Path(OLIVE_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "OLIVE_REPO_PATH = Path(\"./Olive\")\n", + "OLIVE_REQ = \"./requirements.txt\"\n", + "QNN_REQ = \"./qnn_req.txt\"\n", + "\n", + "QNN_PYTHON_PATH = './qnn_venv'\n", + "QNN_PYTHON_BIN_PATH = './qnn_venv/bin'\n", + "qnn_pip_path = Path(QNN_PYTHON_PATH) / \"bin\" / \"pip\"\n", + "QNN_PYTHON_BIN_FULL_PATH = f\"{current_dir}/{QNN_PYTHON_BIN_PATH}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Olive Python Environment" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(OLIVE_PYTHON_PATH):\n", + " print(\"Creating Olive Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(OLIVE_PYTHON_PATH))\n", + "my_env = os.environ.copy()\n", + "my_env[\"BUILD_CUDA_EXT\"] = \"0\"\n", + "GPTQ=\"git+https://github.com/ModelCloud/GPTQModel.git@558449bed3ef2653c36041650d30da6bbbca440d\"\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-U\", \"-r\" , OLIVE_REQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"--no-build-isolation\", GPTQ], env=my_env)\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", + "subprocess.check_call([str(olive_pip_path), \"install\", \"onnxscript==0.5.6\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare QNN Python Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "if not os.path.exists(QNN_PYTHON_PATH):\n", + " print(\"Creating QNN Venv\")\n", + " builder = venv.EnvBuilder(with_pip=True)\n", + " builder.create(Path(QNN_PYTHON_PATH))\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"--no-build-isolation\", \"-r\" , QNN_REQ], env=my_env)\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-e\", OLIVE_REPO_PATH])\n", + "subprocess.check_call([str(qnn_pip_path), \"install\", \"-U\", \"--pre\", \"--extra-index-url\",\n", + " \"https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple\",\n", + " \"onnxruntime-qnn==1.23.2\", \"--no-deps\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🤗 Login to Hugging Face\n", + "To access models, you'll need to log-in to Hugging Face with a [user access token](https://huggingface.co/docs/hub/security-tokens). 
The following command will run you through the steps to login:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli login --token <>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply few patches to Onnxruntime and GPTQModel\n", + "\n", + "This is needed for running the Olive recipies for this model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!patch ./olive_venv/lib/python3.10/site-packages/gptqmodel/utils/model.py < gptqmodel_int8.patch\n", + "!patch ./olive_venv/lib/python3.10/site-packages/onnxruntime_genai/models/builder.py < oga_patch1.patch\n", + "!patch ./olive_venv/lib/python3.10/site-packages/onnxruntime_genai/models/quantized_model.py < oga_patch2.patch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "base_url = \"https://raw.githubusercontent.com/CodeLinaro/onnxruntime/326d9d30129bbad698e0306d24dcea0ec5a19e60\"\n", + "urls = [\n", + " base_url + \"/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py\",\n", + " base_url + \"/onnxruntime/python/tools/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "destinations = [\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/execution_providers/qnn/quant_config.py\",\n", + " OLIVE_PYTHON_PATH+\"/lib/python3.10/site-packages/onnxruntime/quantization/quant_utils.py\"\n", + "]\n", + "\n", + "for url, dest in zip(urls, destinations):\n", + " urllib.request.urlretrieve(url, dest)\n", + " print(f\"Downloaded and replaced: {dest}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Olive Recipes" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**GPU utilization observed during the run**\n", + "\n", + "\t\ta. Text GPTQModel quantization: 12gb\n", + "\t\tb. Text Onnx static quantization: 41gb\n", + "\t\tc. Vision Onnx static quantization: 68gb\n", + " d. 
Embedding Onnx static quantization: 3gb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Clean Context binary directories if they exist\n", + "def clean_directory(path):\n", + " if os.path.exists(path):\n", + " for file in glob.glob(os.path.join(path, '*')):\n", + " if os.path.isfile(file):\n", + " os.remove(file)\n", + "dirs_to_clean = [\n", + " './models/gemma3_qnn/model/',\n", + " './models/gemma-3-4b-it-vision/model/',\n", + " './models/gemma-3-4b-it-embed/model/'\n", + "]\n", + "\n", + "for dir_path in dirs_to_clean:\n", + " clean_directory(dir_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1️⃣ LLM model generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-text-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "data[\"input_model\"][\"model_path\"] = MODEL\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-text-qnn-config.json\n", + "# Do weight sharing in context and iterator model\n", + "!python share_data.py\n", + "!unlink models/gemma3_qnn/model/iterator.onnx.data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2️⃣ Vision model Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config_path = Path(f\"./gemma3-4b-vision-qnn-config.json\")\n", + "with open(config_path, \"r\") as file:\n", + " data = json.load(file)\n", + "data[\"systems\"][\"qnn_system\"][\"python_environment_path\"] = QNN_PYTHON_BIN_FULL_PATH\n", + "\n", + "with open(config_path, \"w\") as file:\n", + " json.dump(data, file, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-export.json\n", + "!python vision_allowzero.py\n", + "!./olive_venv/bin/olive run --config ./gemma3-4b-vision-qnn-config.json\n", + "!rm -rf models/gemma-3-4b-it-vis-onnx/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3️⃣ Embedding Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!./olive_venv/bin/olive run --config ./gemma3-4b-embedding-qnn-config.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare final ORT GenAI folder for on-device inference " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# copy quant emb weight in to vision emb model\n", + "!python copy_embed_quant_param.py\n", + "\n", + "!rm ./models/gemma3_qnn/model/embeddings.onnx\n", + "!cp ./models/gemma-3-4b-it-vision/model/model_updated_ctx.onnx ./models/gemma3_qnn/model/model_ctx_vision.onnx \n", + "!cp ./models/gemma-3-4b-it-vision/model/model_updated_ctx_qnn.bin ./models/gemma3_qnn/model/model_updated_ctx_qnn.bin \n", + "!cp ./genai/*.* ./models/gemma3_qnn/model/\n", + "!ls -al 
./models/gemma3_qnn/model/\n", + "\n", + "print(\"ORT GenAI inference setup: ./models/gemma3_qnn\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-embedding-qnn-config.json b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-embedding-qnn-config.json new file mode 100644 index 00000000..5e221b0a --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-embedding-qnn-config.json @@ -0,0 +1,33 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_embedding.py", + "model_loader": "load_gemma3_embedding_model", + "io_config": { + "input_names": [ "input_ids", "image_features" ], + "input_shapes": [ [ 1, 64 ], [ 1, 256, 2560 ] ], + "input_types": [ "int64", "float32" ], + "output_names": [ "/model/embed_tokens/Mul/output_0" ], + "output_shapes": [ [ 1, 64, 2560 ] ], + "dynamic_axes": { + "input_ids": { "0": "batch_size", "1": "seq_length" }, + "image_features": { "0": "batch_size", "1": "image_tokens_length" } + } + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "cpu", "execution_providers": [ "CPUExecutionProvider" ] } ] + } + }, + "passes": { + "conversion": { "type": "OnnxConversion", "target_opset": 20 }, + "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-embedding" } + }, + "target": "local_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-embed", + "cache_dir": "cache-embd", + "no_artifacts": true +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-text-qnn-config.json b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-text-qnn-config.json new file mode 100644 index 00000000..824bd3f9 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-text-qnn-config.json @@ -0,0 +1,142 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/gemma-3-4b-it", + "custom_task_class_name": "Gemma3ForCausalLM", + "custom_task_class_module": "transformers", + "untie_lm_head_weights": true + }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "wikitext2_train_joined", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "join", + "add_special_tokens": false, + "max_seq_len": 4096, + "max_samples": 128 + } + }, + { + "name": "wikitext2_train_act", + "type": "HuggingfaceContainer", + "load_dataset_config": { + "data_name": "wikitext", + "subset": "wikitext-2-raw-v1", + "split": "train" + }, + "pre_process_data_config": { + "strategy": "line-by-line", + "add_special_tokens": true, + "max_samples": 256, + "max_seq_len": 4096 + } + } + ], + "passes": { + "cs": { + "type": "CaptureSplitInfo", + "num_splits": 2, + "unique_embeds_lm_head_splits": true + }, + "g": { + "type": "GptqModel", + "bits": 4, + "sym": true, + 
"group_size": 128, + "lm_head": true, + "desc_act": false, + "device": "cuda", + "data_config": "wikitext2_train_joined", + "dynamic": { + "+:.*lm_head*": { + "bits": 8, + "sym": true, + "group_size": 16, + "desc_act": false + }, + "+:.*v_proj*": { + "bits": 8, + "sym": true, + "group_size": 32, + "desc_act": false + }, + "+:.*k_proj*": { + "bits": 8, + "sym": true, + "group_size": 32, + "desc_act": false + }, + "+:.*q_proj*": { + "bits": 8, + "sym": true, + "group_size": 32, + "desc_act": false + } + } + }, + "mb": { + "type": "ModelBuilder", + "precision": "int4", + "int4_block_size": 16, + "int4_accuracy_level": 4, + "int4_op_types_to_quantize": [ + "Gather" + ], + "extra_options": { + "use_packed_matmul": true + } + }, + "gs": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "RemoveRopeMultiCache" + }, + { + "surgeon": "AttentionMaskToSequenceLengths" + }, + { + "surgeon": "SimplifiedLayerNormToL2Norm" + } + ], + "save_as_external_data": true + }, + "sp": { + "type": "SplitModel", + "save_as_external_data": false + }, + "st": { + "type": "StaticLLM", + "batch_size": 1, + "context_length": 64, + "save_as_external_data": true + }, + "cp": { + "type": "ComposeOnnxModels" + } + }, + "target": "qnn_system", + "log_severity_level": 0, + "output_dir": "models/gemma3_qnn", + "cache_dir": "cache", + "no_artifacts": true +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-vision-export.json b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-vision-export.json new file mode 100644 index 00000000..68951629 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-vision-export.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_script": "custom_gemma3_4b_vision.py", + "model_loader": "load_gemma3_vision_model", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 896, + 896 + ] + ], + "input_types": [ + "float32" + ], + "output_names": [ + "image_features" + ], + "output_shapes": [ + [ + 1, + 256, + 2560 + ] + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 20 + }, + "surgery": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "MatMulAddToGemm" + } + ] + } + }, + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-vis-onnx", + "cache_dir": "cache-vision", + "no_artifacts": true +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-vision-qnn-config.json b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-vision-qnn-config.json new file mode 100644 index 00000000..1e4f7a59 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/gemma3-4b-vision-qnn-config.json @@ -0,0 +1,65 @@ +{ + "input_model": { + "type": "ONNXModel", + "config": { + "model_path": "models/gemma-3-4b-it-vis-onnx/model_updated.onnx" + } + }, + "systems": { + "qnn_system": { + "type": "PythonEnvironment", + "python_environment_path": "", + "accelerators": [ + { + "execution_providers": [ + "QNNExecutionProvider" + ] + } + ] + } + }, + "data_configs": [ + { + "name": "gemma_vision_data_config", + "user_script": "custom_gemma3_4b_datasets.py", + "load_dataset_config": { + "type": "gemma_image_dataset", + "model_id": "google/gemma-3-4b-it" + } + } + ], + "passes": { + "quantization": { + "type": "OnnxStaticQuantization", + "quant_preprocess": true, + "data_config": "gemma_vision_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "calibration_providers": [ + "CUDAExecutionProvider" + ], + "per_channel": true, + "weight_symmetric": true + }, + "cb": 
{ + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "vtcm_mb": "8", + "htp_arch": "v73", + "soc_model": "60" + } + }, + "add_metadata": { + "type": "AddOliveMetadata", + "graph_name": "gemma-3-4b-it-vision" + } + }, + "target": "qnn_system", + "log_severity_level": 1, + "output_dir": "models/gemma-3-4b-it-vision", + "cache_dir": "cache-vision", + "no_artifacts": true +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/genai/genai_config.json b/google-gemma-Gemma3-4B/qnn-cpu/genai/genai_config.json new file mode 100644 index 00000000..df903eba --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/genai/genai_config.json @@ -0,0 +1,403 @@ +{ + "model": { + "bos_token_id": 2, + "context_length": 131072, + "decoder": { + "session_options": { + "log_id": "onnxruntime-genai", + "provider_options": [ + ] + }, + "head_size": 256, + "hidden_size": 2560, + "inputs": { + "input_ids":"input_ids", + "inputs_embeds": "inputs_embeds", + "attention_mask": "attention_mask", + "past_key_names": "past_key_values.%d.key", + "past_value_names": "past_key_values.%d.value", + "past_sequence_length": "past_seq_len", + "total_sequence_length": "total_seq_len" + }, + "outputs": { + "logits": "logits", + "present_key_names": "present.%d.key", + "present_value_names": "present.%d.value" + }, + "num_attention_heads": 8, + "num_hidden_layers": 34, + "num_key_value_heads": 4, + "sliding_window": { + "window_size": 64, + "slide_key_value_cache": false, + "slide_inputs": true, + "pad_value": 0, + "alignment": "left" + }, + "pipeline": [ + { + "context": { + "filename": "context.onnx", + "inputs": [ + "/model/embed_tokens/Mul/output_0", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + 
"past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + "present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + "present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0" + ], + "run_on_token_gen": false + }, + "iterator": { + "filename": "iterator.onnx", + "inputs": [ + "/model/embed_tokens/Mul/output_0", + "past_key_values.0.key", + "past_key_values.0.value", + "past_seq_len", + "total_seq_len", + "past_key_values.1.key", + "past_key_values.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "past_key_values.29.key", + 
"past_key_values.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "past_key_values.32.key", + "past_key_values.32.value", + "past_key_values.33.key", + "past_key_values.33.value" + ], + "outputs": [ + "present.0.key", + "present.0.value", + "present.1.key", + "present.1.value", + "present.2.key", + "present.2.value", + "present.3.key", + "present.3.value", + "present.4.key", + "present.4.value", + "present.5.key", + "present.5.value", + "present.6.key", + "present.6.value", + "present.7.key", + "present.7.value", + "present.8.key", + "present.8.value", + "present.9.key", + "present.9.value", + "present.10.key", + "present.10.value", + "present.11.key", + "present.11.value", + "present.12.key", + "present.12.value", + "present.13.key", + "present.13.value", + "present.14.key", + "present.14.value", + "present.15.key", + "present.15.value", + "present.16.key", + "present.16.value", + "present.17.key", + "present.17.value", + "present.18.key", + "present.18.value", + "present.19.key", + "present.19.value", + "present.20.key", + "present.20.value", + "present.21.key", + "present.21.value", + "present.22.key", + "present.22.value", + "present.23.key", + "present.23.value", + "present.24.key", + "present.24.value", + "present.25.key", + "present.25.value", + "present.26.key", + "present.26.value", + "present.27.key", + "present.27.value", + "present.28.key", + "present.28.value", + "present.29.key", + "present.29.value", + "present.30.key", + "present.30.value", + "present.31.key", + "present.31.value", + "present.32.key", + "present.32.value", + "present.33.key", + "present.33.value", + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0" + ], + "run_on_prompt": false + }, + "lm_head": { + "filename": "lm_head.onnx", + "inputs": [ + "/model/layers.34/final_norm_layernorm/SkipLayerNorm_Mul_output_0" + ], + "outputs": [ + "logits" + ] + } + } + ] + }, + "embedding": { + "filename": "embed_quant.onnx", + "inputs": { + "input_ids": "input_ids", + "image_features": "image_features" + }, + "outputs": { + "inputs_embeds": "/model/embed_tokens/Mul/output_0" + } + }, + "vision": { + "filename": "model_ctx_vision.onnx", + "inputs": { + "pixel_values": "pixel_values" + }, + "outputs": { + "image_features": "image_features" + }, + "session_options": { + "intra_op_num_threads": 2, + "inter_op_num_threads": 1, + "provider_options": [ + { + "qnn": { + "htp_performance_mode": "burst", + "htp_graph_finalization_optimization_mode": "3", + "soc_model": "60" + } + } + ] + } + }, + "eos_token_id": [ + 1, + 106 + ], + "pad_token_id": 0, + "type": "gemma3", + "vocab_size": 262208 + }, + "search": { + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": true, + "length_penalty": 1.0, + "max_length": 131072, + "min_length": 0, + "no_repeat_ngram_size": 0, + "num_beams": 1, + "num_return_sequences": 1, + "past_present_share_buffer": true, + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_k": 64, + "top_p": 0.95 + } +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/genai/processor_config.json b/google-gemma-Gemma3-4B/qnn-cpu/genai/processor_config.json new file mode 100644 index 00000000..b25059aa --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/genai/processor_config.json @@ -0,0 +1,24 @@ +{ + "processor": { + "name": "gemma_3_image_processing", + "transforms": [ + { "operation": { "name": "decode_image", "type": "DecodeImage", "attrs": { "color_space": "RGB" } } }, + { + "operation": { + "name": "resize", 
+ "type": "Resize", + "attrs": { "interpolation": "CUBIC", "width": 896, "height": 896, "keep_aspect_ratio": 0 } + } + }, + { "operation": { "name": "re-scale", "type": "Rescale" } }, + { + "operation": { + "name": "normalize", + "type": "Normalize", + "attrs": { "mean": [ 0.5, 0.5, 0.5 ], "std": [ 0.5, 0.5, 0.5 ] } + } + }, + { "operation": { "name": "to_channel_first", "type": "Permute3D", "attrs": { "dims": [ 2, 0, 1 ] } } } + ] + } +} diff --git a/google-gemma-Gemma3-4B/qnn-cpu/gptqmodel_int8.patch b/google-gemma-Gemma3-4B/qnn-cpu/gptqmodel_int8.patch new file mode 100644 index 00000000..83cdd399 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/gptqmodel_int8.patch @@ -0,0 +1,40 @@ +--- "model (1).py" 2025-11-21 20:34:23.711972000 -0800 ++++ model.py 2025-11-21 20:34:23.750922000 -0800 +@@ -509,6 +509,11 @@ + return convert_gptq_v2_to_v1_format(model, quantize_config, qlinear_kernel), True + else: + return model, False ++def get_dynamic_bits(name, global_bits, overrides): ++ for pattern, config in overrides.items(): ++ if re.match(pattern.removeprefix("+:"), name): ++ return config.get("bits", global_bits) ++ return global_bits + + def convert_gptq_v2_to_v1_format_module( + module: BaseQuantLinear, +@@ -517,10 +522,10 @@ + assert isinstance(module, BaseQuantLinear) + + log.info.once("Format: Converting GPTQ v2 to v1") +- +- if quantize_config.bits == 2: ++ bits = quantize_config.bits if quantize_config.dynamic is None else get_dynamic_bits(module.name, quantize_config.bits, quantize_config.dynamic) ++ if bits == 2: + module.qzeros.data -= 0b01010101010101010101010101010101 +- elif quantize_config.bits == 3: ++ elif bits == 3: + module.qzeros.data[:, range(0, module.qzeros.data.shape[1], 3)] -= ( + 0b00100100100100100100100100100100 + ) +@@ -530,9 +535,9 @@ + module.qzeros.data[:, range(2, module.qzeros.data.shape[1], 3)] -= ( + 0b01001001001001001001001001001001 + ) +- elif quantize_config.bits == 4: ++ elif bits == 4: + module.qzeros.data -= 0b00010001000100010001000100010001 +- elif quantize_config.bits == 8: ++ elif bits == 8: + module.qzeros.data -= 0b00000001000000010000000100000001 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") diff --git a/google-gemma-Gemma3-4B/qnn-cpu/oga_patch1.patch b/google-gemma-Gemma3-4B/qnn-cpu/oga_patch1.patch new file mode 100644 index 00000000..95d274ce --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/oga_patch1.patch @@ -0,0 +1,27 @@ +--- buildeer_orig.py 2025-11-21 22:56:02.640117000 -0800 ++++ builder.py 2025-11-21 22:58:33.933287000 -0800 +@@ -1898,8 +1898,14 @@ + # Unpack attention weights if needed + self.make_attention_unpacked(layer_id, attention, root_input, **kwargs) + ++ # Get dtype used for MatMul ops ++ q_dtype = getattr(attention.q_proj, "weight", getattr(attention.q_proj, "bits", None)) ++ k_dtype = getattr(attention.k_proj, "weight", getattr(attention.k_proj, "bits", None)) ++ v_dtype = getattr(attention.v_proj, "weight", getattr(attention.v_proj, "bits", None)) ++ qkv_dtype_equal = getattr(q_dtype, "dtype", q_dtype) == getattr(k_dtype, "dtype", k_dtype) == getattr(v_dtype, "dtype", v_dtype) ++ + # Make MatMul nodes +- if self.attention_attrs["use_packed_matmul"]: ++ if self.attention_attrs["use_packed_matmul"] and qkv_dtype_equal: + # Combine 3 MatMuls into 1 packed MatMul + qkv_matmul_basename = f"/model/layers.{layer_id}/attn/qkv_proj/MatMul" + qkv_matmul_name = self.make_packed_matmul(attention.q_proj, attention.k_proj, attention.v_proj, qkv_matmul_basename, root_input) +@@ -1921,7 +1927,7 @@ + 
v_bias_exists = attention.v_proj.bias is not None and torch.count_nonzero(attention.v_proj.bias) > 0 + all_bias_exists = q_bias_exists and k_bias_exists and v_bias_exists + +- if all_bias_exists and self.attention_attrs["use_packed_matmul"]: ++ if all_bias_exists and self.attention_attrs["use_packed_matmul"] and qkv_dtype_equal: + # Combine 3 Adds into 1 packed Add + qkv_add_name = f"/model/layers.{layer_id}/attn/qkv_proj/Add" + self.make_packed_add(attention.q_proj.bias, attention.k_proj.bias, attention.v_proj.bias, qkv_add_name, root_input=self.attention_attrs["q_path"]) diff --git a/google-gemma-Gemma3-4B/qnn-cpu/oga_patch2.patch b/google-gemma-Gemma3-4B/qnn-cpu/oga_patch2.patch new file mode 100644 index 00000000..60d8033d --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/oga_patch2.patch @@ -0,0 +1,26 @@ +--- quantized_model_orig.py 2025-11-21 22:56:22.644060000 -0800 ++++ quantized_model.py 2025-11-21 22:59:41.067178000 -0800 +@@ -863,6 +863,23 @@ + self.pack_qzeros(temp_module) + module.qzeros = temp_module.qzeros + ++ def _load_quant_config(self, quant_attrs): ++ super()._load_quant_config(quant_attrs) ++ self.overrides = quant_attrs["config"].get("dynamic", {}) ++ ++ def get_overrides(self, layer_name): ++ for pattern, overrides in self.overrides.items(): ++ if re.match(pattern.removeprefix("+:"), layer_name): ++ return overrides ++ return {} ++ ++ def get_layer_bits(self, layer_name): ++ return self.get_overrides(layer_name).get("bits", self.global_bits) ++ ++ def get_layer_group_size(self, layer_name): ++ return self.get_overrides(layer_name).get("group_size", self.global_group_size) ++ ++ + class QuarkModel(QuantizedModel): + def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, intermediate_size, num_layers): + super().__init__(quant_type, input_path, quant_attrs, q_size, kv_size, intermediate_size, num_layers) diff --git a/google-gemma-Gemma3-4B/qnn-cpu/qnn_req.txt b/google-gemma-Gemma3-4B/qnn-cpu/qnn_req.txt new file mode 100644 index 00000000..05c84579 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/qnn_req.txt @@ -0,0 +1,7 @@ +coloredlogs +flatbuffers +numpy >= 1.21.6 +packaging +protobuf +sympy +transformers==4.55.2 diff --git a/google-gemma-Gemma3-4B/qnn-cpu/requirements.txt b/google-gemma-Gemma3-4B/qnn-cpu/requirements.txt new file mode 100644 index 00000000..09f834f0 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/requirements.txt @@ -0,0 +1,12 @@ +datasets +onnx==1.16.2 +onnx-ir==0.1.4 +onnxruntime-genai-cuda==0.9.0 +onnxruntime-gpu==1.22.0 +onnxscript +optimum +setuptools +tabulate +tiktoken +tokenizers +transformers==4.52.3 diff --git a/google-gemma-Gemma3-4B/qnn-cpu/share_data.py b/google-gemma-Gemma3-4B/qnn-cpu/share_data.py new file mode 100644 index 00000000..2bd6b370 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/share_data.py @@ -0,0 +1,22 @@ +import os +import onnx + +ctx_path = r'models/gemma3_qnn/model/context.onnx' +itr_path = r'models/gemma3_qnn/model/iterator.onnx' + +cm = onnx.load(ctx_path, load_external_data=False) +im = onnx.load(itr_path, load_external_data=False) + +for ci, ii in zip(cm.graph.initializer, im.graph.initializer): + if ci.name != ii.name: + print(f'initializer are not same {ci.name} <=> {ii.name}') + break + c_loc_idx = None + i_loc_idx = None + if ci.external_data[0].key != 'location' or ii.external_data[0].key != 'location': + print(f'unexpeted mismatch in external data') + continue + if (ci.external_data[1] == ii.external_data[1] and ci.external_data[2] == ci.external_data[2]): + 
ii.external_data[0].value = ci.external_data[0].value + +onnx.save(im, itr_path) diff --git a/google-gemma-Gemma3-4B/qnn-cpu/vision_allowzero.py b/google-gemma-Gemma3-4B/qnn-cpu/vision_allowzero.py new file mode 100644 index 00000000..d3363de8 --- /dev/null +++ b/google-gemma-Gemma3-4B/qnn-cpu/vision_allowzero.py @@ -0,0 +1,14 @@ +import os +import onnx + +mp = r'models/gemma-3-4b-it-vis-onnx/model.onnx' +op = r'models/gemma-3-4b-it-vis-onnx/model_updated.onnx' + +m = onnx.load(mp) +for node in m.graph.node: + del node.metadata_props[:] + if node.op_type == 'Reshape': + for attr in node.attribute: + if attr.name == 'allowzero': + attr.i = 0 +onnx.save(m, op)
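
For a quick sanity check of the graph produced by copy_embed_quant_param.py, the stitched embed_quant.onnx can be loaded and run once with dummy inputs. This is a minimal sketch, not part of the recipe: it assumes a recent onnxruntime build whose CPU execution provider implements the com.microsoft GatherBlockQuantized contrib op, and the dummy shapes simply mirror the io_config in gemma3-4b-embedding-qnn-config.json.

# sanity_check_embed_quant.py -- illustrative only, not part of the recipe
import numpy as np
import onnxruntime as ort

MODEL = "models/gemma3_qnn/model/embed_quant.onnx"

# CPU EP is enough for a smoke test; the quantized Gather is a com.microsoft contrib op.
sess = ort.InferenceSession(MODEL, providers=["CPUExecutionProvider"])
print("inputs :", [(i.name, i.shape, i.type) for i in sess.get_inputs()])
print("outputs:", [(o.name, o.shape) for o in sess.get_outputs()])

# (1, 64) token ids and (1, 256, 2560) projected image features, as declared in the embedding
# config; 262144 is the image placeholder id hard-coded in custom_gemma3_4b_embedding.py.
input_ids = np.zeros((1, 64), dtype=np.int64)
input_ids[0, :32] = 262144
image_features = np.random.rand(1, 256, 2560).astype(np.float32)

(inputs_embeds,) = sess.run(None, {"input_ids": input_ids, "image_features": image_features})
print("inputs_embeds:", inputs_embeds.shape)  # expected (1, 64, 2560)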
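
Once the final folder under models/gemma3_qnn/model/ is assembled, it is consumed through the onnxruntime-genai multimodal API on the target device. The sketch below follows the pattern of the published onnxruntime-genai vision examples; the exact API surface differs between releases, and the Gemma 3 chat markers, the <start_of_image> placeholder, and the file paths are assumptions to be checked against the tokenizer config and your local layout, not the recipe's official runner.

# run_gemma3_qnn.py -- illustrative only; verify against the examples shipped with your
# installed onnxruntime-genai version.
import onnxruntime_genai as og

MODEL_DIR = "models/gemma3_qnn/model"   # folder assembled at the end of the notebook
IMAGE_PATH = "sample.jpg"               # any RGB test image (hypothetical path)

model = og.Model(MODEL_DIR)
processor = model.create_multimodal_processor()
tokenizer_stream = processor.create_stream()

# Gemma 3 style chat turn with an image placeholder (assumed; check the chat template).
prompt = (
    "<start_of_turn>user\n"
    "<start_of_image>Describe this image.<end_of_turn>\n"
    "<start_of_turn>model\n"
)

images = og.Images.open(IMAGE_PATH)
inputs = processor(prompt, images=images)

params = og.GeneratorParams(model)
params.set_inputs(inputs)
params.set_search_options(max_length=512)

generator = og.Generator(model, params)
while not generator.is_done():
    generator.generate_next_token()
    token = generator.get_next_tokens()[0]
    print(tokenizer_stream.decode(token), end="", flush=True)
print()

Sampling settings such as top_k, top_p, and the sliding-window handling are picked up from genai_config.json in the same folder, so the script itself stays minimal.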