Skip to content

Commit 4f16bfc

Browse files
website v1
1 parent fbdcafc commit 4f16bfc

19 files changed

+3471
-4
lines changed

index.html

+351-4
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,355 @@
11
<!DOCTYPE html>
22
<html>
3-
<body>
3+
<head>
4+
<meta charset="utf-8" />
5+
<meta
6+
name="description"
7+
content="Agent Self-Dialogue allows user input minimization"
8+
/>
9+
<meta
10+
name="keywords"
11+
content="AIUTA, VLM, LLM, Large Language Model, LLama, Vision Language Model, LLava, Embodied AI, Navigation, Uncertainty, Instance Object Navigation, Object Goal Navigation, VLM uncertainty"
12+
/>
13+
<meta name="viewport" content="width=device-width, initial-scale=1" />
14+
<title>
15+
Collaborative Instance Navigation: Leveraging Agent Self-Dialogue to
16+
Minimize User Input
17+
</title>
418

5-
<h1>tmp</h1>
19+
<link
20+
href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
21+
rel="stylesheet"
22+
/>
623

7-
</body>
8-
</html>
24+
<link rel="stylesheet" href="./static/css/bulma.min.css" />
25+
<!-- <link rel="stylesheet" href="./static/css/bulma-carousel.min.css" />
26+
<link rel="stylesheet" href="./static/css/bulma-slider.min.css" /> -->
27+
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css" />
28+
<link
29+
rel="stylesheet"
30+
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"
31+
/>
32+
<link rel="stylesheet" href="./static/css/index.css" />
33+
<link rel="icon" href="./static/images/flaticon_coin_128.png" />
34+
35+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
36+
<script defer src="./static/js/fontawesome.all.min.js"></script>
37+
<!-- <script src="./static/js/bulma-carousel.min.js"></script>
38+
<script src="./static/js/bulma-slider.min.js"></script> -->
39+
<script src="./static/js/index.js"></script>
40+
</head>
41+
<body>
42+
<nav class="navbar" role="navigation" aria-label="main navigation">
43+
<div class="navbar-brand">
44+
<a
45+
role="button"
46+
class="navbar-burger"
47+
aria-label="menu"
48+
aria-expanded="false"
49+
>
50+
<span aria-hidden="true"></span>
51+
<span aria-hidden="true"></span>
52+
<span aria-hidden="true"></span>
53+
</a>
54+
</div>
55+
<div class="navbar-menu">
56+
<div class="navbar-start" style="flex-grow: 1; justify-content: center">
57+
<a class="navbar-item" href="https://francescotaioli.github.io/">
58+
<span class="icon">
59+
<i class="fas fa-home"></i>
60+
</span>
61+
</a>
62+
63+
<div class="navbar-item has-dropdown is-hoverable">
64+
<a class="navbar-link"> More Research </a>
65+
<div class="navbar-dropdown">
66+
<a
67+
class="navbar-item"
68+
href="https://intelligolabs.github.io/Le-RNR-Map/"
69+
>
70+
Language-enhanced RNR-Map
71+
</a>
72+
<a
73+
class="navbar-item"
74+
href="https://intelligolabs.github.io/R2RIE-CE/"
75+
>
76+
Mind the Error in VLN
77+
</a>
78+
<a
79+
class="navbar-item"
80+
href="https://intelligolabs.github.io/unsupervised_active_visual_search/"
81+
>
82+
POMP-BE-PD
83+
</a>
84+
</div>
85+
</div>
86+
</div>
87+
</div>
88+
</nav>
89+
90+
<section class="hero">
91+
<div class="hero-body">
92+
<div class="container is-max-desktop">
93+
<div class="columns is-centered">
94+
<div class="column has-text-centered">
95+
<h1 class="title is-1 publication-title">
96+
Collaborative Instance Navigation: Leveraging Agent
97+
Self-Dialogue to Minimize User Input
98+
</h1>
99+
<div class="is-size-5 publication-authors">
100+
<span class="author-block">
101+
<a href="https://francescotaioli.github.io/"
102+
>Francesco Taioli</a
103+
><sup>1,2</sup>,</span
104+
>
105+
<span class="author-block">
106+
<a
107+
href="https://scholar.google.com/citations?hl=it&user=fqdv3d4AAAAJ&view_op=list_works&sortby=pubdate"
108+
>Edoardo Zorzi</a
109+
><sup>2</sup>,
110+
</span>
111+
<span class="author-block"
112+
><a href="https://giannifranchi.github.io/">Gianni Franchi</a
113+
><sup>3</sup>,
114+
</span>
115+
<span class="author-block">
116+
<a href="https://profs.scienze.univr.it/~castellini/"
117+
>Alberto Castellini</a
118+
><sup>2</sup>,
119+
</span>
120+
<span class="author-block">
121+
<a href="http://profs.sci.univr.it/~farinelli/"
122+
>Alessandro Farinelli</a
123+
><sup>2</sup>,
124+
</span>
125+
<span class="author-block"
126+
><a href="https://www.dimi.univr.it/?ent=persona&id=218"
127+
>Marco Cristani</a
128+
><sup>2</sup>,
129+
</span>
130+
<span class="author-block"
131+
><a href="https://www.yimingwang.it/">Yiming Wang</a
132+
><sup>4</sup>
133+
</span>
134+
</div>
135+
136+
<div class="is-size-5 publication-authors">
137+
<span class="author-block"
138+
><sup>1</sup>Polytechnic of Turin,</span
139+
>
140+
<span class="author-block"
141+
><sup>2</sup>University of Verona,</span
142+
>
143+
<span class="author-block"
144+
><sup>3</sup>U2IS, ENSTA Paris, Institut Polytechnique de
145+
Paris</span
146+
>
147+
<span class="author-block"
148+
><sup>4</sup>Fondazione Bruno Kessler</span
149+
>
150+
</div>
151+
152+
<div class="column has-text-centered">
153+
<div class="publication-links">
154+
<span class="link-block">
155+
<a
156+
href="https://github.com/intelligolabs/CoIN"
157+
class="external-link button is-normal is-rounded is-dark"
158+
>
159+
<span class="icon">
160+
<i class="fab fa-github"></i>
161+
</span>
162+
<span>Code (coming soon)</span>
163+
</a>
164+
</span>
165+
<span class="link-block">
166+
<a
167+
href=""
168+
class="external-link button is-normal is-rounded is-dark"
169+
>
170+
<span class="icon">
171+
<i class="far fa-images"></i>
172+
</span>
173+
<span>Data - CoIN-Bench (coming soon)</span>
174+
</a>
175+
</span>
176+
<span class="link-block">
177+
<a
178+
href=""
179+
class="external-link button is-normal is-rounded is-dark"
180+
>
181+
<span class="icon">
182+
<i class="far fa-images"></i>
183+
</span>
184+
<span>Data - IDKVQA dataset (coming soon)</span>
185+
</a>
186+
</span>
187+
</div>
188+
</div>
189+
</div>
190+
</div>
191+
</div>
192+
</div>
193+
</section>
194+
195+
<section class="hero teaser">
196+
<div class="container">
197+
<div class="columns is-centered">
198+
<div class="column is-max-desktop">
199+
<div class="content has-text-centered">
200+
<div class="columns is-centered">
201+
<div class="column is-10">
202+
<img
203+
class="image is-fullwidth"
204+
src="./static/images/teaser.png"
205+
alt="Teaser"
206+
/>
207+
<div class="content has-text-justified">
208+
<p style="padding: 0px 2em 0 2em">
209+
Sketched episode of the proposed
210+
<b><i>Collaborative Instance Navigation (CoIN)</i></b>
211+
task. The human user (bottom left) provides a request (<i
212+
>"Find the picture"</i
213+
>
214+
) in <i>natural language</i>. The agent has to locate the
215+
object within a <i>completely unknown environment</i>,
216+
interacting with the user only when needed via
217+
<i>template-free, open-ended</i> natural-language
218+
dialogue. Our method, <b>A</b>gent-user <b>I</b>nteraction
219+
with <b>U</b>ncerTainty <b>A</b>wareness (<b>AIUTA</b>),
220+
addresses this challenging task, minimizing user
221+
interactions by equipping the agent with two modules: a
222+
<b>Self-Questioner</b> and an <b>Interaction Trigger</b>,
223+
whose output is shown in the blue boxes along the agent’s
224+
path (① to ⑤), and whose inner working is shown on the
225+
right. The <b>Self-Questioner</b> leverages a Large
226+
Language Model (LLM) and Vision Language Model (VLM) in a
227+
self-dialogue to initially describe the agent’s
228+
observation, and then extract additional relevant details,
229+
with a novel entropy-based technique to reduce
230+
<b
231+
><font color="red"
232+
>hallucinations and inaccuracies</font
233+
></b
234+
>, producing a refined
235+
<b><font color="green">detection description</font></b
236+
>. The <b>Interaction Trigger</b> uses this refined
237+
description to decide whether to pose a question to the
238+
user (①,③,④), continue the navigation (②) or halt the
239+
exploration (⑤).
240+
</p>
241+
</div>
242+
</div>
243+
</div>
244+
</div>
245+
</div>
246+
</div>
247+
</div>
248+
</section>
249+
250+
<section class="section">
251+
<div class="container is-max-desktop">
252+
<!-- Abstract. -->
253+
<div class="columns is-centered has-text-centered">
254+
<div class="column is-full">
255+
<h2 class="title is-3">Abstract</h2>
256+
<div class="content has-text-justified">
257+
<p>
258+
Existing embodied instance goal navigation tasks, driven by
259+
natural language, assume human users to provide complete and
260+
nuanced instance descriptions prior to the navigation, which can
261+
be impractical in the real world as human instructions might be
262+
brief and ambiguous.
263+
</p>
264+
<p>
265+
&nbsp;&nbsp;&nbsp;To bridge this gap, we propose a new task,
266+
Collaborative Instance Navigation (CoIN), with dynamic
267+
agent-human interaction during navigation to actively resolve
268+
uncertainties about the target instance in natural,
269+
template-free, open-ended dialogues.
270+
</p>
271+
<p>
272+
&nbsp;&nbsp;&nbsp;To address CoIN, we propose a novel method,
273+
Agent-user Interaction with UncerTainty Awareness (AIUTA),
274+
leveraging the perception capability of Vision Language Models
275+
(VLMs) and the capability of Large Language Models (LLMs).
276+
First, upon object detection, a Self-Questioner model initiates
277+
a self-dialogue to obtain a complete and accurate observation
278+
description, while a novel uncertainty estimation technique
279+
mitigates inaccurate VLM perception. Then, an Interaction
280+
Trigger module determines whether to ask a question to the user,
281+
continue or halt navigation, minimizing user input.
282+
</p>
283+
284+
<p>
285+
&nbsp;&nbsp;&nbsp;For evaluation, we introduce CoIN-Bench, a
286+
benchmark supporting both real and simulated humans. AIUTA
287+
achieves competitive performance in instance navigation against
288+
state-of-the-art methods, demonstrating great flexibility in
289+
handling user inputs.
290+
</p>
291+
</div>
292+
</div>
293+
</div>
294+
<!--/ Abstract. -->
295+
296+
<!-- Paper video. -->
297+
<div class="columns is-centered has-text-centered">
298+
<div class="column is-full">
299+
<h2 class="title is-3">Video</h2>
300+
<div class="publication-video">
301+
<iframe
302+
style="display: block; background-color: white"
303+
src="static/videos/aiuta_demo.mp4"
304+
frameborder="0"
305+
width="1920"
306+
height="1080"
307+
allow="autoplay; encrypted-media"
308+
allowfullscreen
309+
></iframe>
310+
</div>
311+
</div>
312+
</div>
313+
<!--/ Paper video. -->
314+
</div>
315+
</section>
316+
<!--
317+
<section class="section" id="BibTeX">
318+
<div class="container is-max-desktop content">
319+
<h2 class="title">BibTeX</h2>
320+
<pre><code>TODO</code></pre>
321+
</div>
322+
</section> -->
323+
324+
<footer class="footer">
325+
<div class="container">
326+
<div class="content has-text-centered"></div>
327+
<div class="columns is-centered">
328+
<div class="column is-8">
329+
<div class="content">
330+
<p>
331+
This website is licensed under a
332+
<a
333+
rel="license"
334+
href="http://creativecommons.org/licenses/by-sa/4.0/"
335+
>Creative Commons Attribution-ShareAlike 4.0 International
336+
License</a
337+
>.
338+
</p>
339+
<p>
340+
This means you are free to borrow the
341+
<a href="https://github.com/nerfies/nerfies.github.io"
342+
>source code</a
343+
>
344+
of this website; we just ask that you link back to this page in
345+
the footer. Please remember to remove the analytics code
346+
included in the header of the website which you do not want on
347+
your website.
348+
</p>
349+
</div>
350+
</div>
351+
</div>
352+
</div>
353+
</footer>
354+
</body>
355+
</html>

static/css/bulma-carousel.min.css

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)