Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions QEfficient/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,26 @@
#
# -----------------------------------------------------------------------------

from QEfficient.base.common import QEFFCommonLoader # noqa: F401

Check failure on line 8 in QEfficient/base/__init__.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (I001)

QEfficient/base/__init__.py:8:1: I001 Import block is un-sorted or un-formatted
from QEfficient.transformers.models.modeling_auto import ( # noqa: F401
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
)
# from QEfficient.transformers.models.modeling_auto import ( # noqa: F401
# QEFFAutoModel,
# QEFFAutoModelForCausalLM,
# QEFFAutoModelForImageTextToText,
# QEFFAutoModelForSpeechSeq2Seq,
# )


# __init__.py
def get_qeff_models():
    """Lazily import and return the QEFF auto-model classes keyed by class name.

    Deferring the import to call time avoids pulling the heavy
    ``modeling_auto`` module (and its transformers/torch dependencies)
    into the package's import path.

    :return: dict mapping class-name strings to the corresponding classes.
    """
    from QEfficient.transformers.models import modeling_auto

    exported = (
        "QEFFAutoModel",
        "QEFFAutoModelForCausalLM",
        "QEFFAutoModelForImageTextToText",
        "QEFFAutoModelForSpeechSeq2Seq",
    )
    return {name: getattr(modeling_auto, name) for name in exported}
3 changes: 2 additions & 1 deletion QEfficient/base/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model.
"""

import os
from typing import Any

from transformers import AutoConfig
import importlib

from QEfficient.base.modeling_qeff import QEFFBaseModel
from QEfficient.transformers.modeling_utils import EXTERNAL_MODEL_CLASS_MAPPING, MODEL_CLASS_MAPPING
from QEfficient.utils import login_and_download_hf_lm

Check failure on line 23 in QEfficient/base/common.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (I001)

QEfficient/base/common.py:15:1: I001 Import block is un-sorted or un-formatted


class QEFFCommonLoader:
Expand Down Expand Up @@ -47,7 +48,7 @@
or EXTERNAL_MODEL_CLASS_MAPPING[config.__class__.__name__]
)
if class_name:
module = __import__("QEfficient.transformers.models.modeling_auto")
module = importlib.import_module("QEfficient.transformers.models.modeling_auto")
model_class = getattr(module, class_name)
else:
raise NotImplementedError(
Expand Down
208 changes: 105 additions & 103 deletions QEfficient/generation/cloud_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,33 @@

import numpy as np

try:
import qaicrt
except ImportError:
import platform
import sys
# try:
# import qaicrt
# except ImportError:
# import platform
# import sys

sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt
# sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
# import qaicrt

try:
import QAicApi_pb2 as aicapi
except ImportError:
import sys
# try:
# import QAicApi_pb2 as aicapi
# except ImportError:
# import sys

sys.path.append("/opt/qti-aic/dev/python")
import QAicApi_pb2 as aicapi
# sys.path.append("/opt/qti-aic/dev/python")
# import QAicApi_pb2 as aicapi

aic_to_np_dtype_mapping = {
aicapi.FLOAT_TYPE: np.dtype(np.float32),
aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
aicapi.INT8_Q_TYPE: np.dtype(np.int8),
aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
aicapi.INT16_Q_TYPE: np.dtype(np.int16),
aicapi.INT32_Q_TYPE: np.dtype(np.int32),
aicapi.INT32_I_TYPE: np.dtype(np.int32),
aicapi.INT64_I_TYPE: np.dtype(np.int64),
aicapi.INT8_TYPE: np.dtype(np.int8),
# aicapi.FLOAT_TYPE: np.dtype(np.float32),
# aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
# aicapi.INT8_Q_TYPE: np.dtype(np.int8),
# aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
# aicapi.INT16_Q_TYPE: np.dtype(np.int16),
# aicapi.INT32_Q_TYPE: np.dtype(np.int32),
# aicapi.INT32_I_TYPE: np.dtype(np.int32),
# aicapi.INT64_I_TYPE: np.dtype(np.int64),
# aicapi.INT8_TYPE: np.dtype(np.int8),
}


Expand All @@ -58,59 +58,61 @@ def __init__(
:activate: bool. If false, activation will be disabled. Default=True.
:enable_debug_logs: bool. If True, It will enable debug logs. Default=False.
"""
# Load QPC
if device_ids is not None:
devices = qaicrt.QIDList(device_ids)
self.context = qaicrt.Context(devices)
self.queue = qaicrt.Queue(self.context, device_ids[0])
else:
self.context = qaicrt.Context()
self.queue = qaicrt.Queue(self.context, 0) # Async API
if enable_debug_logs:
if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to setLogLevel")
qpc = qaicrt.Qpc(str(qpc_path))
# Load IO Descriptor
iodesc = aicapi.IoDesc()
status, iodesc_data = qpc.getIoDescriptor()
if status != qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to getIoDescriptor")
iodesc.ParseFromString(bytes(iodesc_data))
self.allowed_shapes = [
[(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
for allowed_shape in iodesc.allowed_shapes
]
self.bindings = iodesc.selected_set.bindings
self.binding_index_map = {binding.name: binding.index for binding in self.bindings}
# Create and load Program
prog_properties = qaicrt.QAicProgramProperties()
prog_properties.SubmitRetryTimeoutMs = 60_000
if device_ids and len(device_ids) > 1:
prog_properties.devMapping = ":".join(map(str, device_ids))
self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
if self.program.load() != qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to load program")
if activate:
self.activate()
# Create input qbuffers and buf_dims
self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
self.buf_dims = qaicrt.BufferDimensionsVecRef(
[(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
)
# # Load QPC
# if device_ids is not None:
# devices = qaicrt.QIDList(device_ids)
# self.context = qaicrt.Context(devices)
# self.queue = qaicrt.Queue(self.context, device_ids[0])
# else:
# self.context = qaicrt.Context()
# self.queue = qaicrt.Queue(self.context, 0) # Async API
# if enable_debug_logs:
# if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS:
# raise RuntimeError("Failed to setLogLevel")
# qpc = qaicrt.Qpc(str(qpc_path))
# # Load IO Descriptor
# iodesc = aicapi.IoDesc()
# status, iodesc_data = qpc.getIoDescriptor()
# if status != qaicrt.QStatus.QS_SUCCESS:
# raise RuntimeError("Failed to getIoDescriptor")
# iodesc.ParseFromString(bytes(iodesc_data))
# self.allowed_shapes = [
# [(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
# for allowed_shape in iodesc.allowed_shapes
# ]
# self.bindings = iodesc.selected_set.bindings
# self.binding_index_map = {binding.name: binding.index for binding in self.bindings}
# # Create and load Program
# prog_properties = qaicrt.QAicProgramProperties()
# prog_properties.SubmitRetryTimeoutMs = 60_000
# if device_ids and len(device_ids) > 1:
# prog_properties.devMapping = ":".join(map(str, device_ids))
# self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
# if self.program.load() != qaicrt.QStatus.QS_SUCCESS:
# raise RuntimeError("Failed to load program")
# if activate:
# self.activate()
# # Create input qbuffers and buf_dims
# self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
# self.buf_dims = qaicrt.BufferDimensionsVecRef(
# [(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
# )

@property
def input_names(self) -> List[str]:
return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_INPUT]
# return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_INPUT]
return None

@property
def output_names(self) -> List[str]:
return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT]
# return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT]
return None

    def activate(self):
        """Activate the loaded QPC program.

        NOTE(review): ``self.program`` (and ``self.context``) are presumably
        assigned in ``__init__``, which is currently commented out in this
        change — as written, calling this will raise ``AttributeError``.
        Confirm whether activation is meant to be disabled too.
        """

        self.program.activate()
        self.execObj = qaicrt.ExecObj(self.context, self.program)
        # self.execObj = qaicrt.ExecObj(self.context, self.program)

def deactivate(self):
"""Deactivate qpc"""
Expand All @@ -131,7 +133,7 @@ def set_buffers(self, buffers: Dict[str, np.ndarray]):
warn(f'Buffer: "{buffer_name}" not found')
continue
buffer_index = self.binding_index_map[buffer_name]
self.qbuffers[buffer_index] = qaicrt.QBuffer(buffer.tobytes())
# self.qbuffers[buffer_index] = qaicrt.QBuffer(buffer.tobytes())
self.buf_dims[buffer_index] = (
buffer.itemsize,
buffer.shape if len(buffer.shape) > 0 else (1,),
Expand Down Expand Up @@ -159,48 +161,48 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
"""
# Set inputs
self.set_buffers(inputs)
if self.execObj.setData(self.qbuffers, self.buf_dims) != qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to setData")
# # Run with sync API
# if self.execObj.run(self.qbuffers) != qaicrt.QStatus.QS_SUCCESS:
# Run with async API
if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to enqueue")
if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS:
error_message = "Failed to run"
# Print additional error messages for unmatched dimension error
if self.allowed_shapes:
error_message += "\n\n"
error_message += '(Only if "No matching dimension found" error is present above)'
error_message += "\nAllowed shapes:"
for i, allowed_shape in enumerate(self.allowed_shapes):
error_message += f"\n{i}\n"
for binding, (elemsize, shape), (_, passed_shape) in zip(
self.bindings, allowed_shape, self.buf_dims
):
if passed_shape == [0]:
if not binding.is_partial_buf_allowed:
warn(f"Partial buffer not allowed for: {binding.name}")
continue
error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
error_message += "\n\nPassed shapes:\n"
for binding, (elemsize, shape) in zip(self.bindings, self.buf_dims):
if shape == [0]:
continue
error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
raise ValueError(error_message)
# Get output buffers
status, output_qbuffers = self.execObj.getData()
if status != qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to getData")
# if self.execObj.setData(self.qbuffers, self.buf_dims) != qaicrt.QStatus.QS_SUCCESS:
# raise MemoryError("Failed to setData")
# # # Run with sync API
# # if self.execObj.run(self.qbuffers) != qaicrt.QStatus.QS_SUCCESS:
# # Run with async API
# if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS:
# raise MemoryError("Failed to enqueue")
# if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS:
# error_message = "Failed to run"
# # Print additional error messages for unmatched dimension error
# if self.allowed_shapes:
# error_message += "\n\n"
# error_message += '(Only if "No matching dimension found" error is present above)'
# error_message += "\nAllowed shapes:"
# for i, allowed_shape in enumerate(self.allowed_shapes):
# error_message += f"\n{i}\n"
# for binding, (elemsize, shape), (_, passed_shape) in zip(
# self.bindings, allowed_shape, self.buf_dims
# ):
# if passed_shape == [0]:
# if not binding.is_partial_buf_allowed:
# warn(f"Partial buffer not allowed for: {binding.name}")
# continue
# error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
# error_message += "\n\nPassed shapes:\n"
# for binding, (elemsize, shape) in zip(self.bindings, self.buf_dims):
# if shape == [0]:
# continue
# error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
# raise ValueError(error_message)
# # Get output buffers
# status, output_qbuffers = self.execObj.getData()
# if status != qaicrt.QStatus.QS_SUCCESS:
# raise MemoryError("Failed to getData")
# Build output
outputs = {}
for output_name in self.output_names:
buffer_index = self.binding_index_map[output_name]
if self.qbuffers[buffer_index].size == 0:
continue
outputs[output_name] = np.frombuffer(
bytes(output_qbuffers[buffer_index]),
aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
).reshape(self.buf_dims[buffer_index][1])
# outputs[output_name] = np.frombuffer(
# bytes(output_qbuffers[buffer_index]),
# aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
# ).reshape(self.buf_dims[buffer_index][1])
return outputs
3 changes: 2 additions & 1 deletion QEfficient/transformers/models/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,61 +5,62 @@
#
# ----------------------------------------------------------------------------

import hashlib
import warnings
from pathlib import Path
from time import perf_counter
from typing import Dict, List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForImageTextToText,
AutoModelForSpeechSeq2Seq,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
TextStreamer,
)

import QEfficient
from QEfficient.base.modeling_qeff import QEFFBaseModel

from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.generation.text_generation_inference import (
CloudAI100ExecInfoNew,
PerfMetrics,
calculate_latency,
get_compilation_dims,
)
from QEfficient.transformers.modeling_utils import DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH
from QEfficient.transformers.models.pytorch_transforms import (
CustomOpsTransform,
KVCacheExternalModuleMapperTransform,
KVCacheTransform,
PoolingTransform,
SamplerTransform,
SpDTransform,
VlmKVOffloadTransform,
VlmNoKVOffloadTransform,
)
from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
from QEfficient.transformers.quantizers.quant_transforms import (
AwqToMatmulNbitsTransform,
FP8DeQuantLinearToLinearTransform,
GPTQToMatmulNbitsTransform,
)
from QEfficient.utils import (
constants,
get_padding_shape_from_config,
)
from QEfficient.utils.cache import to_hashable
from QEfficient.utils.logging_utils import logger


from QEfficient.base.modeling_qeff import QEFFBaseModel

Check failure on line 63 in QEfficient/transformers/models/modeling_auto.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (I001)

QEfficient/transformers/models/modeling_auto.py:8:1: I001 Import block is un-sorted or un-formatted
class QEFFTransformersBase(QEFFBaseModel):
"""
Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file.
Expand Down
71 changes: 71 additions & 0 deletions tests/transformers/models/test_gpt2_windows.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from transformers import AutoModelForCausalLM
from QEfficient.utils import hf_download
from QEfficient.utils.constants import Constants
from QEfficient.utils.run_utils import ApiRunner
from QEfficient.utils._utils import load_hf_tokenizer
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

Check failure on line 6 in tests/transformers/models/test_gpt2_windows.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (I001)

tests/transformers/models/test_gpt2_windows.py:1:1: I001 Import block is un-sorted or un-formatted

def load_causal_lm_model(model_config):
    """Download a HuggingFace causal-LM checkpoint and load it for evaluation.

    :model_config: Dict with key ``"model_name"`` (HF repo id).

    :return: tuple ``(model_hf, params)`` — the loaded model in eval mode and
        its total parameter count.
    """
    local_path = hf_download(
        repo_id=model_config["model_name"],
        ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
    )
    hf_model = AutoModelForCausalLM.from_pretrained(
        local_path,
        use_cache=True,
        attn_implementation="eager",
        low_cpu_mem_usage=False,
    )  # Run models for single layers only
    total_params = sum(p.numel() for p in hf_model.parameters())
    hf_model.eval()
    return hf_model, total_params

# Smoke-test script: load GPT-2, wrap it in the QEFF interface, and export it.
model_name = "gpt2"

model_config = {"model_name": model_name}

# Parameter count returned by the loader is unused here.
model_hf, _ = load_causal_lm_model(model_config)

tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
config = model_hf.config
batch_size = len(Constants.INPUT_STR)
# ApiRunner drives reference generation for the given prompts and lengths.
api_runner = ApiRunner(
    batch_size,
    tokenizer,
    config,
    Constants.INPUT_STR,
    Constants.PROMPT_LEN,
    Constants.CTX_LEN,
)

# pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
# print("Pytorch HF tokens:", pytorch_hf_tokens)

qeff_model = QEFFAutoModelForCausalLM(model_hf, pretrained_model_name_or_path=model_name)

# pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
# print("Pytorch KV tokens:", pytorch_kv_tokens)

# Export to ONNX only; the compile step below is left disabled.
# NOTE(review): presumably compilation requires the AIC toolchain, which is
# unavailable in this (Windows) environment — confirm before enabling.
qpc_path = qeff_model.export()
print("qpc_path: ", qpc_path)

# qpc_path = qeff_model.compile(
# prefill_seq_len=Constants.PROMPT_LEN,
# ctx_len=Constants.CTX_LEN,
# num_cores=16,
# mxfp6_matmul=False,
# mxint8_kv_cache=False,
# num_devices=1,
# mos=1,
# aic_enable_depth_first=True,
# num_speculative_tokens=None,
# )
# print("Compiled Successfully at path: ", qpc_path)
Loading