Squashed commit of the following:
commit ddbcb18
Merge: ba5547f 69c623b
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 21:39:57 2024 -0400

    Merge branch 'main' of https://github.com/MbodiAI/mbodied-agents

commit ba5547f
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 16:05:01 2024 -0400

    Update example openvla server and fix dependencies

commit 69c623b
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 21:39:18 2024 -0400

    fix dependencies

commit 9263d29
Author: Xavier Chi <[email protected]>
Date:   Mon Jun 17 21:37:43 2024 -0400

    Fix dependencies (#24)

    * Update example openvla server.

    * fix dependencies

commit 3b7d857
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 16:05:01 2024 -0400

    Update example openvla server.
nqyy committed Jun 18, 2024
1 parent 1a6af9c commit bacecb7
Showing 2 changed files with 17 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
     "anthropic",
     "backoff",
     "pydantic",
-    "pydantic_numpy",
+    "pydantic_numpy==5.0.2",
     "torch",
     "h5py",
     "click",
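For context, a quick way to confirm the pin after installing the project (a minimal sketch using only the standard library; the distribution name is taken from the dependency string above):

    from importlib.metadata import version

    # The pin above should resolve to exactly this release.
    assert version("pydantic_numpy") == "5.0.2"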
22 changes: 16 additions & 6 deletions src/mbodied_agents/agents/motion/openvla_example_server.py
@@ -3,7 +3,7 @@
 from PIL import Image
 import torch
 from io import BytesIO
-
+import base64
 
 class OpenVLAInterface:
     """This class encapsulates the OpenVLA Agent's capabilities for remote action prediction."""
@@ -19,9 +19,18 @@ def __init__(self, model_name="openvla/openvla-7b", device="cuda"):
             trust_remote_code=True,
         ).to(device)
 
-    def predict_action(self, image_base64, instruction, unnorm_key=None):
+    def predict_action(self, image_base64, instruction, unnorm_key=None, image_path=None):
         """Predicts the robot's action based on the provided image and instruction."""
-        image = Image.open(BytesIO(image_base64))
+        if image_base64:
+            # Assume it's a base64 image
+            image = Image.open(BytesIO(base64.b64decode(image_base64)))
+        elif image_path:
+            # Assume it's an uploaded image
+            image = Image.open(image_path)
+        else:
+            raise ValueError(
+                "Either an uploaded image or a base64 image must be provided.")
+
         prompt = f"In: What action should the robot take to {instruction}?\nOut:"
         inputs = self.processor(prompt, image).to("cuda", dtype=torch.bfloat16)
         action = self.model.predict_action(
@@ -35,13 +44,14 @@ def create_interface():
     gr_interface = gr.Interface(
         fn=vla_interface.predict_action,
         inputs=[
-            gr.Textbox(label="Base64 Image"),
+            gr.Textbox(label="Base64 Image (using API) or upload image below.", visible=False),
             gr.Textbox(label="Instruction"),
-            gr.Textbox(label="Unnorm Key"),
+            gr.Textbox(label="Unnorm Key", placeholder="bridge_orig"),
+            gr.Image(label="Upload Image", type="filepath"),
         ],
         outputs=gr.Textbox(label="Robot Action"),
         title="OpenVLA Robot Action Prediction",
-        description="Provide an image of the robot workspace and an instruction to predict the robot's action. You can either upload an image or provide a base64-encoded image."
+        description="Provide an image of the robot workspace and an instruction to predict the robot's action. You can either upload an image or provide a base64-encoded image with API."
     )
     return gr_interface

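For reference, a minimal in-process sketch of the new dual-input behavior of predict_action (hypothetical usage, not part of the diff; assumes a CUDA device, downloaded model weights, and a local file workspace.png):

    import base64

    interface = OpenVLAInterface()

    # Base64 branch: encode a file and pass it as image_base64.
    with open("workspace.png", "rb") as f:
        image_b64 = base64.b64encode(f.read())
    action = interface.predict_action(image_b64, "pick up the red block",
                                      unnorm_key="bridge_orig")

    # Path branch: pass an empty image_base64 so the elif takes over.
    action = interface.predict_action("", "pick up the red block",
                                      unnorm_key="bridge_orig",
                                      image_path="workspace.png")

Because the check is a truthiness test on image_base64, an empty string falls through to the image_path branch; a caller supplying both inputs gets the base64 branch.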

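The now-hidden Base64 textbox remains reachable programmatically, matching the "with API" wording in the description. A hedged sketch of an HTTP call against the running server (assumes Gradio's generic /api/predict REST route and the default port 7860; the endpoint path may differ across Gradio versions):

    import base64
    import requests

    with open("workspace.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    # Inputs are ordered as in the gr.Interface definition:
    # base64 image, instruction, unnorm key, uploaded image (None here).
    resp = requests.post(
        "http://localhost:7860/api/predict",
        json={"data": [image_b64, "pick up the red block", "bridge_orig", None]},
    )
    print(resp.json()["data"][0])  # predicted robot action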