
Local deployment results differ from the Hugging Face demo #81

@jbfhx

Description


I deployed the showui-2b model myself, but the results seem worse than the demo on Hugging Face. Could you spare some time to answer my question? Thanks.
Below are my test results (both use the same prompt: "点击我的", i.e. "click 'My'"):
DEMO RESULT (correct):
[screenshot]
MY RESULT (incorrect):
[screenshot]

Here is my inference code:

import time

import torch
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

class ShowUI:
    def __init__(self):
        model_path = "./weights/showui-2b"
        start_time = time.time()
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map={"": 0}
        )

        # Visual token budget handed to the processor (in units of 28x28-pixel patches).
        self.min_pixels = 256*28*28
        self.max_pixels = 1344*28*28

        self.processor = AutoProcessor.from_pretrained(model_path, min_pixels=self.min_pixels, max_pixels=self.max_pixels)
        load_time = time.time() - start_time
        print('-------------------------')
        print(f"model load time: {load_time:.2f} s")

        # Keep the prompt text flush-left: an indented triple-quoted string embeds
        # the leading spaces into the prompt itself, which likely differs from the
        # prompt the demo sends.
        self.nav_system = """You are an assistant trained to navigate the {_APP} screen.
Given a task instruction and a screen observation, output the executable action.
Here is the action space:
{_ACTION_SPACE}
"""

        self.nav_format = """Format the action as a dictionary with the following keys:
{"action": "ACTION_TYPE", "value": "element", "position": [x,y]}
If value or position is not applicable, set it as "null".
Position represents the relative coordinates on the screenshot and should be scaled to a range of 0-1.
"""

        self.action_map = {
            'web': """
1. `CLICK`: Click on an element, value is not applicable and the position [x,y] is required.
2. `INPUT`: Type a string into an element, value is a string to type and the position [x,y] is required.
3. `SELECT`: Select a value for an element, value is not applicable and the position [x,y] is required.
4. `HOVER`: Hover on an element, value is not applicable and the position [x,y] is required.
5. `ANSWER`: Answer the question, value is the answer and the position is not applicable.
6. `ENTER`: Enter operation, value and position are not applicable.
7. `SCROLL`: Scroll the screen, value is the direction to scroll and the position is not applicable.
8. `SELECT_TEXT`: Select some text content, value is not applicable and position [[x1,y1], [x2,y2]] is the start and end position of the select operation.
9. `COPY`: Copy the text, value is the text to copy and the position is not applicable.
10. `IDENTIFY`: Identify the text, value is the text to identify and the position [[x1,y1], [x2,y2]] is the start and end position of the element.
11. `ASSERTION`: Assert an element exists, value is not applicable and the position [x,y] is the coordinates of the element.
""",

            'phone': """
1. `INPUT`: Type a string into an element, value is a string to type and the position [x,y] is required.
2. `SWIPE`: Swipe the screen, value is not applicable and the position [[x1,y1], [x2,y2]] is the start and end position of the swipe operation.
3. `TAP`: Tap on an element, value is not applicable and the position [x,y] is required.
4. `ANSWER`: Answer the question, value is the status (e.g., 'task complete') and the position is not applicable.
5. `ENTER`: Enter operation, value and position are not applicable.
6. `ASSERTION`: Assert an element exists, value is not applicable and the position [x,y] is the coordinates of the element.
"""
        }


    def infer(self, img_url, query, split):
        system_prompt = self.nav_system.format(_APP=split, _ACTION_SPACE=self.action_map[split]) + self.nav_format
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": f'Task: {query}'},
                    {"type": "image", "image": img_url, "min_pixels": self.min_pixels, "max_pixels": self.max_pixels},
                ],
            }
        ]

        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
        inputs = inputs.to("cuda")
    
        generated_ids = self.model.generate(**inputs, max_new_tokens=512)
        # Strip the prompt tokens so only the newly generated action string is decoded.
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        outputs = self.processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    
        return outputs

    def cleanup(self):
        torch.cuda.empty_cache()
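
For reference, here is a minimal sketch of how I call the class and map the relative coordinates back to pixels (the screenshot path, the query, and the ast.literal_eval parsing are example choices of mine, not taken from the demo):

import ast
from PIL import Image

ui = ShowUI()
raw = ui.infer("./screenshot.png", "点击我的", split="web")
print(raw)  # expected shape: {'action': 'CLICK', 'value': None, 'position': [0.49, 0.42]}

# The prompt asks for relative coordinates in [0, 1]; scale them to pixel space.
action = ast.literal_eval(raw)
pos = action.get("position")
if action.get("action") == "CLICK" and isinstance(pos, list):
    img = Image.open("./screenshot.png")
    print(f"click at pixel ({pos[0] * img.width:.0f}, {pos[1] * img.height:.0f})")
ui.cleanup()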
