Squashed commit of the following:
commit ddbcb18
Merge: ba5547f 69c623b
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 21:39:57 2024 -0400

    Merge branch 'main' of https://github.com/MbodiAI/mbodied-agents

commit ba5547f
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 16:05:01 2024 -0400

    Update example openvla server and fix dependencies

commit 69c623b
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 21:39:18 2024 -0400

    fix dependencies

commit 9263d29
Author: Xavier Chi <[email protected]>
Date:   Mon Jun 17 21:37:43 2024 -0400

    Fix dependencies (#24)

    * Update example openvla server.

    * fix dependencies

commit 3b7d857
Author: Xavier(Tianhao) Chi <[email protected]>
Date:   Mon Jun 17 16:05:01 2024 -0400

    Update example openvla server.
nqyy committed Jun 18, 2024
1 parent 1a6af9c commit bacecb7
Showing 2 changed files with 17 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
     "anthropic",
     "backoff",
     "pydantic",
-    "pydantic_numpy",
+    "pydantic_numpy==5.0.2",
     "torch",
     "h5py",
     "click",
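For context, a quick way to confirm the pin after installing the project (a minimal sketch using only the standard library; the distribution name is taken from the dependency string above):

    from importlib.metadata import version

    # The pin above should resolve to exactly this release.
    assert version("pydantic_numpy") == "5.0.2"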
22 changes: 16 additions & 6 deletions src/mbodied_agents/agents/motion/openvla_example_server.py
@@ -3,7 +3,7 @@
 from PIL import Image
 import torch
 from io import BytesIO
-
+import base64
 
 class OpenVLAInterface:
     """This class encapsulates the OpenVLA Agent's capabilities for remote action prediction."""
@@ -19,9 +19,18 @@ def __init__(self, model_name="openvla/openvla-7b", device="cuda"):
             trust_remote_code=True,
         ).to(device)
 
-    def predict_action(self, image_base64, instruction, unnorm_key=None):
+    def predict_action(self, image_base64, instruction, unnorm_key=None, image_path=None):
         """Predicts the robot's action based on the provided image and instruction."""
-        image = Image.open(BytesIO(image_base64))
+        if image_base64:
+            # Assume it's a base64 image
+            image = Image.open(BytesIO(base64.b64decode(image_base64)))
+        elif image_path:
+            # Assume it's an uploaded image
+            image = Image.open(image_path)
+        else:
+            raise ValueError(
+                "Either an uploaded image or a base64 image must be provided.")
+
         prompt = f"In: What action should the robot take to {instruction}?\nOut:"
         inputs = self.processor(prompt, image).to("cuda", dtype=torch.bfloat16)
         action = self.model.predict_action(
@@ -35,13 +44,14 @@ def create_interface():
     gr_interface = gr.Interface(
         fn=vla_interface.predict_action,
         inputs=[
-            gr.Textbox(label="Base64 Image"),
+            gr.Textbox(label="Base64 Image (using API) or upload image below.", visible=False),
             gr.Textbox(label="Instruction"),
-            gr.Textbox(label="Unnorm Key"),
+            gr.Textbox(label="Unnorm Key", placeholder="bridge_orig"),
+            gr.Image(label="Upload Image", type="filepath"),
         ],
         outputs=gr.Textbox(label="Robot Action"),
         title="OpenVLA Robot Action Prediction",
-        description="Provide an image of the robot workspace and an instruction to predict the robot's action. You can either upload an image or provide a base64-encoded image."
+        description="Provide an image of the robot workspace and an instruction to predict the robot's action. You can either upload an image or provide a base64-encoded image with API."
     )
     return gr_interface

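For reference, a minimal in-process sketch of the new dual-input behavior of predict_action (hypothetical usage, not part of the diff; assumes a CUDA device, downloaded model weights, and a local file workspace.png):

    import base64

    interface = OpenVLAInterface()

    # Base64 branch: encode a file and pass it as image_base64.
    with open("workspace.png", "rb") as f:
        image_b64 = base64.b64encode(f.read())
    action = interface.predict_action(image_b64, "pick up the red block",
                                      unnorm_key="bridge_orig")

    # Path branch: pass an empty image_base64 so the elif takes over.
    action = interface.predict_action("", "pick up the red block",
                                      unnorm_key="bridge_orig",
                                      image_path="workspace.png")

Because the check is a truthiness test on image_base64, an empty string falls through to the image_path branch; a caller supplying both inputs gets the base64 branch.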

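The now-hidden Base64 textbox remains reachable programmatically, matching the "with API" wording in the description. A hedged sketch of an HTTP call against the running server (assumes Gradio's generic /api/predict REST route and the default port 7860; the endpoint path may differ across Gradio versions):

    import base64
    import requests

    with open("workspace.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    # Inputs are ordered as in the gr.Interface definition:
    # base64 image, instruction, unnorm key, uploaded image (None here).
    resp = requests.post(
        "http://localhost:7860/api/predict",
        json={"data": [image_b64, "pick up the red block", "bridge_orig", None]},
    )
    print(resp.json()["data"][0])  # predicted robot action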