forked from microsoft/OmniParser
Commit 6cd06a7 (0 parents)
Showing 17 changed files with 2,091 additions and 0 deletions.
@@ -0,0 +1,32 @@ (README)

# OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent

![Logo](imgs/logo.png)
[![arXiv](https://img.shields.io/badge/Paper-green)](https://arxiv.org/abs/2408.00203)
[![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

**OmniParser** is a comprehensive method for parsing user interface screenshots into structured, easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface.

## Examples
We put together a few simple examples in `demo.ipynb`.
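For orientation, the snippet below is a minimal sketch of a single parsing call, adapted from the Gradio demo script added in this commit; the image path, `draw_bbox_config` values, and thresholds are illustrative, and a CUDA device is assumed for the caption model.

```python
# Minimal parsing sketch (adapted from the demo code in this commit).
# Assumptions: CUDA device for the caption model; path and thresholds are illustrative.
import base64
import io

from PIL import Image
from utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img

yolo_model = get_yolo_model()
caption_model_processor = get_caption_model_processor('florence', device='cuda')

image_path = 'examples/pc_1.png'
draw_bbox_config = {'text_scale': 0.8, 'text_thickness': 2, 'text_padding': 2, 'thickness': 2}

# OCR pass: recognized text plus xyxy boxes.
(text, ocr_bbox), _ = check_ocr_box(
    image_path, display_img=False, output_bb_format='xyxy', goal_filtering=None,
    easyocr_args={'paragraph': False, 'text_threshold': 0.9})

# Detect, label, and caption interactable elements on top of the OCR results.
labeled_img_b64, label_coordinates, parsed_content_list = get_som_labeled_img(
    image_path, yolo_model, BOX_TRESHOLD=0.05, output_coord_in_ratio=True,
    ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config,
    caption_model_processor=caption_model_processor, ocr_text=text, iou_threshold=0.3)

Image.open(io.BytesIO(base64.b64decode(labeled_img_b64))).save('labeled.png')
print('\n'.join(parsed_content_list))
```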
## Gradio Demo
To run the Gradio demo, simply run:
```bash
python gradio_demo.py
```

## 📚 Citation
Our technical report can be found [here](https://arxiv.org/abs/2408.00203).
If you find our work useful, please consider citing it:
```bibtex
@misc{lu2024omniparserpurevisionbased,
      title={OmniParser for Pure Vision Based GUI Agent},
      author={Yadong Lu and Jianwei Yang and Yelong Shen and Ahmed Awadallah},
      year={2024},
      eprint={2408.00203},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2408.00203},
}
```
(Diff of one large file is not rendered.)
@@ -0,0 +1,104 @@ (Gradio demo script)
from typing import Optional, Tuple

import base64
import io

import gradio as gr
import torch
from PIL import Image

from utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img

# Load the icon-detection (YOLO) model and the icon-caption model once at startup.
yolo_model = get_yolo_model()
caption_model_processor = get_caption_model_processor('florence', device='cuda')  # alternatives: 'blip2-opt-2.7b-ui', phi3v_ui

# Bounding-box drawing style and detection threshold, selected per target platform.
platform = 'pc'
if platform == 'pc':
    draw_bbox_config = {
        'text_scale': 0.8,
        'text_thickness': 2,
        'text_padding': 2,
        'thickness': 2,
    }
    BOX_TRESHOLD = 0.05
elif platform == 'web':
    draw_bbox_config = {
        'text_scale': 0.8,
        'text_thickness': 2,
        'text_padding': 3,
        'thickness': 3,
    }
    BOX_TRESHOLD = 0.05
elif platform == 'mobile':
    draw_bbox_config = {
        'text_scale': 0.8,
        'text_thickness': 2,
        'text_padding': 3,
        'thickness': 3,
    }
    BOX_TRESHOLD = 0.05

MARKDOWN = """
# OmniParser for Pure Vision Based General GUI Agent 🔥
<div>
    <a href="https://arxiv.org/pdf/2408.00203">
        <img src="https://img.shields.io/badge/arXiv-2408.00203-b31b1b.svg" alt="Arxiv" style="display:inline-block;">
    </a>
</div>

OmniParser is a screen parsing tool to convert general GUI screens into structured elements. **Trained models will be released soon.**
"""

DEVICE = torch.device('cuda')

# @spaces.GPU
# @torch.inference_mode()
# @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def process(image_input, prompt: Optional[str] = None) -> Tuple[Image.Image, str]:
    # NOTE: hardcoded scratch path from the author's environment; adjust for your setup.
    image_path = "/home/yadonglu/sandbox/data/omniparser_demo/image_input.png"
    image_input.save(image_path)

    # OCR pass, then set-of-marks labeling and captioning of the detected elements.
    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
        image_path, display_img=False, output_bb_format='xyxy', goal_filtering=None,
        easyocr_args={'paragraph': False, 'text_threshold': 0.9})
    text, ocr_bbox = ocr_bbox_rslt
    print('prompt:', prompt)
    dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
        image_path, yolo_model, BOX_TRESHOLD=BOX_TRESHOLD, output_coord_in_ratio=True,
        ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config,
        caption_model_processor=caption_model_processor, ocr_text=text,
        iou_threshold=0.3, prompt=prompt)
    image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
    print('finish processing')
    parsed_content_list = '\n'.join(parsed_content_list)
    return image, str(parsed_content_list)

with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Upload image')
            prompt_input_component = gr.Textbox(label='Prompt', placeholder='')
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Parsed screen elements', placeholder='Text Output')

    submit_button_component.click(
        fn=process,
        inputs=[image_input_component, prompt_input_component],
        outputs=[image_output_component, text_output_component],
    )

demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
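Once the demo above is running, it can also be driven over HTTP with `gradio_client` (installed as a dependency of `gradio`). The sketch below is a hedged example: the endpoint name and argument handling follow Gradio 4.x defaults and are assumptions, so check `client.view_api()` for the actual signature, and `screenshot.png` is a hypothetical local file.

```python
# A hedged sketch for querying the running demo programmatically.
# Assumptions: demo reachable on localhost:7861; the click handler is exposed
# under its default endpoint name ("/process") - verify with client.view_api().
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7861/")
client.view_api()  # prints available endpoints and their parameters

annotated_path, parsed_text = client.predict(
    handle_file("screenshot.png"),  # screenshot to parse (hypothetical file)
    "",                             # optional prompt
    api_name="/process",
)
print(parsed_text)
```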
(Four files in this commit cannot be displayed.)
@@ -0,0 +1,60 @@ (Omniparser wrapper class)
from typing import Dict, Tuple, List
import base64
import io
import time

import torch
from PIL import Image
from ultralytics import YOLO

from utils import get_som_labeled_img, check_ocr_box, get_caption_model_processor, get_dino_model, get_yolo_model

config = {
    'som_model_path': 'finetuned_icon_detect.pt',
    'device': 'cpu',
    'caption_model_path': 'Salesforce/blip2-opt-2.7b',
    'draw_bbox_config': {
        'text_scale': 0.8,
        'text_thickness': 2,
        'text_padding': 3,
        'thickness': 3,
    },
    'BOX_TRESHOLD': 0.05
}


class Omniparser(object):
    def __init__(self, config: Dict):
        self.config = config

        # Icon/region detector (set-of-marks model); the caption model is disabled here.
        self.som_model = get_yolo_model(model_path=config['som_model_path'])
        # self.caption_model_processor = get_caption_model_processor(config['caption_model_path'], device=config['device'])
        # self.caption_model_processor['model'].to(torch.float32)

    def parse(self, image_path: str):
        print('Parsing image:', image_path)
        # OCR pass: recognized text plus xyxy boxes.
        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
            image_path, display_img=False, output_bb_format='xyxy', goal_filtering=None,
            easyocr_args={'paragraph': False, 'text_threshold': 0.9})
        text, ocr_bbox = ocr_bbox_rslt

        draw_bbox_config = self.config['draw_bbox_config']
        BOX_TRESHOLD = self.config['BOX_TRESHOLD']
        dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
            image_path, self.som_model, BOX_TRESHOLD=BOX_TRESHOLD, output_coord_in_ratio=False,
            ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config, caption_model_processor=None,
            ocr_text=text, use_local_semantics=False)

        image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
        # Format the output: entries with parsed text are typed 'text';
        # the remaining detections are typed 'icon'.
        return_list = [
            {'from': 'omniparser',
             'shape': {'x': coord[0], 'y': coord[1], 'width': coord[2], 'height': coord[3]},
             'text': parsed_content_list[i].split(': ')[1], 'type': 'text'}
            for i, (k, coord) in enumerate(label_coordinates.items()) if i < len(parsed_content_list)]
        return_list.extend(
            [{'from': 'omniparser',
              'shape': {'x': coord[0], 'y': coord[1], 'width': coord[2], 'height': coord[3]},
              'text': 'None', 'type': 'icon'}
             for i, (k, coord) in enumerate(label_coordinates.items()) if i >= len(parsed_content_list)])

        return [image, return_list]


if __name__ == '__main__':
    # Quick self-test: time the parser on a bundled example when run as a script.
    parser = Omniparser(config)
    image_path = 'examples/pc_1.png'

    s = time.time()
    image, parsed_content_list = parser.parse(image_path)
    device = config['device']
    print(f'Time taken for Omniparser on {device}:', time.time() - s)
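For reference, a usage sketch of the wrapper above: it assumes the file is saved as `omniparser.py`, that `finetuned_icon_detect.pt` is available locally, and that `my_screenshot.png` is a hypothetical input image; the element dictionaries follow the format built in `parse()`.

```python
# A hedged usage sketch: parse your own screenshot and walk the returned elements.
# Assumes the module above is importable as `omniparser`; 'my_screenshot.png' is hypothetical.
from omniparser import Omniparser, config

parser = Omniparser(config)
annotated_image, elements = parser.parse('my_screenshot.png')

annotated_image.save('my_screenshot_labeled.png')
for el in elements:
    shape = el['shape']
    print(f"{el['type']} at ({shape['x']}, {shape['y']}), "
          f"{shape['width']}x{shape['height']}: {el['text']}")
```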
@@ -0,0 +1,14 @@ (pinned Python dependencies)
torch==2.2.2
easyocr==1.7.1
torchvision==0.17.2
supervision==0.18.0
openai==1.3.5
transformers==4.40.2
ultralytics==8.1.24
azure-identity
numpy
opencv-python==4.8.1.78
opencv-python-headless==4.8.0.74
gradio==4.40.0
(One empty file and three binary files are not shown.)