1
1
<!DOCTYPE html>
<!-- lang tells screen readers, search engines and translation tools which
     language the page is written in. -->
<html lang="en">
  <head>
    <!-- Document metadata. The charset declaration stays first so it is seen
         before any other text in the head. -->
    <meta charset="utf-8" />
    <meta
      name="description"
      content="Agent Self-Dialogue allows user input minimization"
    />
    <meta
      name="keywords"
      content="AIUTA, VLM, LLM, Large Language Model, LLama, Vision Language Model, LLava, Embodied AI, Navigation, Uncertainty, Instance Object Navigation, Object Goal Navigation, VLM uncertainty"
    />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>
      Collaborative Instance Navigation: Leveraging Agent Self-Dialogue to
      Minimize User Input
    </title>

    <!-- Web fonts served by Google Fonts. -->
    <link
      href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
      rel="stylesheet"
    />

    <!-- Stylesheets: Bulma, Font Awesome, Academicons, then the page's own
         rules last so they can override the libraries. -->
    <link rel="stylesheet" href="./static/css/bulma.min.css" />
    <!-- <link rel="stylesheet" href="./static/css/bulma-carousel.min.css" />
    <link rel="stylesheet" href="./static/css/bulma-slider.min.css" /> -->
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css" />
    <link
      rel="stylesheet"
      href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"
    />
    <link rel="stylesheet" href="./static/css/index.css" />
    <link rel="icon" href="./static/images/flaticon_coin_128.png" />

    <!-- Scripts. jQuery is loaded before index.js; NOTE(review): index.js
         presumably depends on jQuery, so keep this order unless confirmed
         otherwise. -->
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script defer src="./static/js/fontawesome.all.min.js"></script>
    <!-- <script src="./static/js/bulma-carousel.min.js"></script>
    <script src="./static/js/bulma-slider.min.js"></script> -->
    <script src="./static/js/index.js"></script>
  </head>
41
+ < body >
42
<!-- Top navigation bar: a home link and a hover dropdown listing related
     research project pages. -->
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <!-- Burger toggle (three bars). NOTE(review): presumably wired to the
         menu by static/js/index.js; confirm, and keep aria-expanded in sync
         with the menu state there. -->
    <a
      role="button"
      class="navbar-burger"
      aria-label="menu"
      aria-expanded="false"
    >
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center">
      <!-- Icon-only link: aria-label supplies the accessible name that the
           glyph alone cannot provide to assistive technology. -->
      <a
        class="navbar-item"
        href="https://francescotaioli.github.io/"
        aria-label="Home"
      >
        <span class="icon">
          <i class="fas fa-home"></i>
        </span>
      </a>

      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">More Research</a>
        <div class="navbar-dropdown">
          <a
            class="navbar-item"
            href="https://intelligolabs.github.io/Le-RNR-Map/"
          >
            Language-enhanced RNR-Map
          </a>
          <a
            class="navbar-item"
            href="https://intelligolabs.github.io/R2RIE-CE/"
          >
            Mind the Error in VLN
          </a>
          <a
            class="navbar-item"
            href="https://intelligolabs.github.io/unsupervised_active_visual_search/"
          >
            POMP-BE-PD
          </a>
        </div>
      </div>
    </div>
  </div>
</nav>
89
+
90
<!-- Hero header: paper title, author list (superscripts index the
     affiliations below), affiliations, and resource buttons. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">
            Collaborative Instance Navigation: Leveraging Agent
            Self-Dialogue to Minimize User Input
          </h1>

          <!-- Authors -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://francescotaioli.github.io/">Francesco Taioli</a
              ><sup>1,2</sup>,
            </span>
            <span class="author-block">
              <a
                href="https://scholar.google.com/citations?hl=it&user=fqdv3d4AAAAJ&view_op=list_works&sortby=pubdate"
                >Edoardo Zorzi</a
              ><sup>2</sup>,
            </span>
            <span class="author-block">
              <a href="https://giannifranchi.github.io/">Gianni Franchi</a
              ><sup>3</sup>,
            </span>
            <span class="author-block">
              <a href="https://profs.scienze.univr.it/~castellini/"
                >Alberto Castellini</a
              ><sup>2</sup>,
            </span>
            <span class="author-block">
              <a href="http://profs.sci.univr.it/~farinelli/"
                >Alessandro Farinelli</a
              ><sup>2</sup>,
            </span>
            <span class="author-block">
              <a href="https://www.dimi.univr.it/?ent=persona&id=218"
                >Marco Cristani</a
              ><sup>2</sup>,
            </span>
            <span class="author-block">
              <a href="https://www.yimingwang.it/">Yiming Wang</a
              ><sup>4</sup>
            </span>
          </div>

          <!-- Affiliations -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"
              ><sup>1</sup>Polytechnic of Turin,</span
            >
            <span class="author-block"
              ><sup>2</sup>University of Verona,</span
            >
            <span class="author-block"
              ><sup>3</sup>U2IS, ENSTA Paris, Institut Polytechnique de
              Paris</span
            >
            <span class="author-block"
              ><sup>4</sup>Fondazione Bruno Kessler</span
            >
          </div>

          <!-- Resource buttons -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <span class="link-block">
                <a
                  href="https://github.com/intelligolabs/CoIN"
                  class="external-link button is-normal is-rounded is-dark"
                >
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code (coming soon)</span>
                </a>
              </span>
              <!-- The two dataset buttons have no destination yet, so they
                   carry no href: an empty href resolves to the current page
                   and would reload it on click. Add the real URL on release. -->
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="far fa-images"></i>
                  </span>
                  <span>Data - CoIN-Bench (coming soon)</span>
                </a>
              </span>
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="far fa-images"></i>
                  </span>
                  <span>Data - IDKVQA dataset (coming soon)</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
194
+
195
<!-- Teaser: overview figure followed by its caption. Coloured phrases use
     inline CSS on <span> in place of the obsolete <font color> element; the
     rendered colours are unchanged. -->
<section class="hero teaser">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-max-desktop">
        <div class="content has-text-centered">
          <div class="columns is-centered">
            <div class="column is-10">
              <img
                class="image is-fullwidth"
                src="./static/images/teaser.png"
                alt="Sketched episode of the Collaborative Instance Navigation task and overview of the AIUTA method"
              />
              <div class="content has-text-justified">
                <p style="padding: 0px 2em 0 2em">
                  Sketched episode of the proposed
                  <b><i>Collaborative Instance Navigation (CoIN)</i></b>
                  task. The human user (bottom left) provides a request
                  (<i>"Find the picture"</i>) in <i>natural language</i>. The
                  agent has to locate the object within a
                  <i>completely unknown environment</i>, interacting with the
                  user only when needed via
                  <i>template-free, open-ended</i> natural-language dialogue.
                  Our method, <b>A</b>gent-user <b>I</b>nteraction with
                  <b>U</b>ncerTainty <b>A</b>wareness (<b>AIUTA</b>),
                  addresses this challenging task, minimizing user
                  interactions by equipping the agent with two modules: a
                  <b>Self-Questioner</b> and an <b>Interaction Trigger</b>,
                  whose output is shown in the blue boxes along the agent’s
                  path (① to ⑤), and whose inner working is shown on the
                  right. The <b>Self-Questioner</b> leverages a Large
                  Language Model (LLM) and Vision Language Model (VLM) in a
                  self-dialogue to initially describe the agent’s
                  observation, and then extract additional relevant details,
                  with a novel entropy-based technique to reduce
                  <b
                    ><span style="color: red"
                      >hallucinations and inaccuracies</span
                    ></b
                  >, producing a refined
                  <b><span style="color: green">detection description</span></b
                  >. The <b>Interaction Trigger</b> uses this refined
                  description to decide whether to pose a question to the
                  user (①,③,④), continue the navigation (②) or halt the
                  exploration (⑤).
                </p>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
249
+
250
<!-- Main content: the paper abstract followed by the demo video. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-full">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Existing embodied instance goal navigation tasks, driven by
            natural language, assume human users to provide complete and
            nuanced instance descriptions prior to the navigation, which can
            be impractical in the real world as human instructions might be
            brief and ambiguous.
          </p>
          <p>
            To bridge this gap, we propose a new task,
            Collaborative Instance Navigation (CoIN), with dynamic
            agent-human interaction during navigation to actively resolve
            uncertainties about the target instance in natural,
            template-free, open-ended dialogues.
          </p>
          <p>
            To address CoIN, we propose a novel method,
            Agent-user Interaction with UncerTainty Awareness (AIUTA),
            leveraging the perception capability of Vision Language Models
            (VLMs) and the capability of Large Language Models (LLMs).
            First, upon object detection, a Self-Questioner model initiates
            a self-dialogue to obtain a complete and accurate observation
            description, while a novel uncertainty estimation technique
            mitigates inaccurate VLM perception. Then, an Interaction
            Trigger module determines whether to ask a question to the user,
            continue or halt navigation, minimizing user input.
          </p>

          <p>
            For evaluation, we introduce CoIN-Bench, a
            benchmark supporting both real and simulated humans. AIUTA
            achieves competitive performance in instance navigation against
            state-of-the-art methods, demonstrating great flexibility in
            handling user inputs.
          </p>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->

    <!-- Paper video. -->
    <div class="columns is-centered has-text-centered">
      <!-- "is-full" matches the Abstract column above; the previous
           "is-is-full" is not a Bulma class and was silently ignored. -->
      <div class="column is-full">
        <h2 class="title is-3">Video</h2>
        <div class="publication-video">
          <!-- title gives the frame an accessible name; "border: 0" replaces
               the obsolete frameborder="0" attribute with the same effect.
               NOTE(review): kept as an iframe in case the page CSS targets
               iframes inside .publication-video; confirm before switching
               to <video controls>. -->
          <iframe
            style="display: block; background-color: white; border: 0"
            src="static/videos/aiuta_demo.mp4"
            title="AIUTA demo video"
            width="1920"
            height="1080"
            allow="autoplay; encrypted-media"
            allowfullscreen
          ></iframe>
        </div>
      </div>
    </div>
    <!--/ Paper video. -->
  </div>
</section>
316
+ <!--
317
+ <section class="section" id="BibTeX">
318
+ <div class="container is-max-desktop content">
319
+ <h2 class="title">BibTeX</h2>
320
+ <pre><code>TODO</code></pre>
321
+ </div>
322
+ </section> -->
323
+
324
<!-- Page footer: site licence and attribution to the website template. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered"></div>
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is licensed under a
            <!-- Linked over https rather than plain http. -->
            <a
              rel="license"
              href="https://creativecommons.org/licenses/by-sa/4.0/"
              >Creative Commons Attribution-ShareAlike 4.0 International
              License</a
            >.
          </p>
          <p>
            This means you are free to borrow the
            <a href="https://github.com/nerfies/nerfies.github.io"
              >source code</a
            >
            of this website, we just ask that you link back to this page in
            the footer. Please remember to remove the analytics code
            included in the header of the website which you do not want on
            your website.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>
354
+ </ body >
355
+ </ html >
0 commit comments