update teaser
yaolug committed Jan 8, 2025
1 parent e2c90b9 commit 09c797e
Showing 3 changed files with 31 additions and 40 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1 +1 @@
Project page for COAT.
Project page for VILA.
Binary file modified asset/teaser.jpg
69 changes: 30 additions & 39 deletions index.html
@@ -9,7 +9,9 @@
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NVILA: Efficient Frontiers of Visual Language Models</title>
<title>
NVILA: Efficient Vision Language Models
</title>
<style>
:root {
color-scheme: light;
@@ -636,8 +638,8 @@
<!-- </div>-->
<div class="hero">
<h2>
<img src="asset/NVILA.png" alt="Logo" style="width: 160px; height: auto; margin-right: 3px;">: Efficient
Frontier Visual Language Models
<!-- <img src="asset/NVILA.png" alt="Logo" style="width: 160px; height: auto; margin-right: 3px;">: Efficient Frontier Visual Language Models -->
NVILA: Efficient Vision Language Models
</h2>
<p>Train Cheaper, Run Faster, Perform Better!</p>

@@ -647,16 +649,16 @@ <h2>
<a href="https://zhijianliu.com" target="_blank" style="color: #76b900;">Zhijian Liu</a><sup>1,†</sup>,
<a href="https://lzhu.me" target="_blank" style="color: #76b900;">Ligeng Zhu</a><sup>1,†</sup>,
<a href="#" target="_blank" style="color: #76b900;">Baifeng Shi</a><sup>1,3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Zhuoyang Zhang</a><sup>1,2</sup>,
<a href="#" target="_blank" style="color: #76b900;">Yuming Lou</a><sup>1,6</sup>,
<a href="#" target="_blank" style="color: #76b900;">Shang Yang</a><sup>1,2</sup>,
<a href="#" target="_blank" style="color: #76b900;">Zhuoyang Zhang</a><sup>2</sup>,
<a href="#" target="_blank" style="color: #76b900;">Yuming Lou</a><sup>6</sup>,
<a href="#" target="_blank" style="color: #76b900;">Shang Yang</a><sup>2</sup>,
<a href="https://xijiu9.github.io" target="_blank" style="color: #76b900;">Haocheng
Xi</a><sup>1,3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Shiyi Cao</a><sup>1,3</sup>,
Xi</a><sup>3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Shiyi Cao</a><sup>3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Yuxian Gu</a><sup>2,6</sup>,
<a href="#" target="_blank" style="color: #76b900;">Dacheng Li</a><sup>1,3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Xiuyu Li</a><sup>1,3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Yunhao Fang</a><sup>1,4</sup>,
<a href="#" target="_blank" style="color: #76b900;">Dacheng Li</a><sup>3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Xiuyu Li</a><sup>3</sup>,
<a href="#" target="_blank" style="color: #76b900;">Yunhao Fang</a><sup>4</sup>,
<a href="#" target="_blank" style="color: #76b900;">Yukang Chen</a><sup>1</sup>,
<a href="#" target="_blank" style="color: #76b900;">Cheng-Yu Hsieh</a><sup>5</sup>,
<a href="#" target="_blank" style="color: #76b900;">De-An Huang</a><sup>1</sup>,
@@ -673,7 +675,7 @@ <h2>
<a href="https://hanlab.mit.edu/songhan/" target="_blank" style="color: #76b900;">Song
Han</a><sup>1,2‡</sup>,
<a href="https://scholar.google.com/citations?user=OI7zFmwAAAAJ&hl=en/" target="_blank"
style="color: #76b900;">Yao Lu</a><sup>1,‡</sup>,
style="color: #76b900;">Yao Lu</a><sup>1,</sup>,
</p>
<p style="font-size: 1.2em; color: #888;">
<sup>1</sup>NVIDIA,
@@ -713,11 +715,13 @@

<a href="https://arxiv.org/abs/2412.04468" class="button">Paper</a>
<a href="https://github.com/NVlabs/VILA" class="button">Code</a>
<a href="https://vila.mit.edu/" class="button">Demo</a>
<a href="https://huggingface.co/collections/Efficient-Large-Model/nvila-674f8163543890b35a91b428"
class="button">Models</a>
<a href="https://ligeng-zhu-vila.hf.space/" class="button">Demo</a>
<a href="https://forms.gle/6nf1QdPYdvC2vgxM8/" class="button">Subscribe</a>
<a style="color: #c2c2c2ea;"
href="https://huggingface.co/collections/Efficient-Large-Model/nvila-674f8163543890b35a91b428"
class="button">Models (coming soon)</a>
<a href="#bibtex" class="button">Citation</a>

</div>


@@ -881,21 +885,15 @@
<div class="description-content">
<h1>About NVILA</h1>
<p style="margin-bottom: 20px;">
Visual language models (VLMs) have made significant advances in accuracy in recent years. However, their
efficiency has received much less attention. This paper introduces <strong>NVILA</strong>, a family of
open VLMs designed to optimize both efficiency and accuracy. Building on top of VILA, we improve its
model architecture by first <strong>scaling up</strong> the spatial and temporal resolutions, and then
<strong>compressing</strong> visual tokens. This "scale-then-compress" approach enables NVILA to
efficiently process high-resolution images and long videos. We also conduct a systematic investigation
to enhance the efficiency of NVILA throughout its entire lifecycle, from training and fine-tuning to
deployment. NVILA matches or surpasses the accuracy of many leading open and proprietary VLMs across a
wide range of image and video benchmarks. At the same time, it reduces training costs by
<strong>4.5×</strong>, fine-tuning memory usage by <strong>3.4×</strong>, pre-filling latency by
<strong>1.6-2.2×</strong>, and decoding latency by <strong>1.2-2.8×</strong>. We make our code and
models available to facilitate reproducibility.
Visual language models (VLMs) have made significant advances in accuracy in recent years.
However, their efficiency has received much less attention.
This paper introduces <strong>NVILA</strong>, a family of open VLMs designed to optimize both efficiency and accuracy.
Building on top of research from NVIDIA, including VILA and NVLM, we improve the model architecture by first scaling up the spatial and temporal resolutions, and then compressing visual tokens.
This <strong>"scale-then-compress" approach</strong> enables these VLMs to efficiently process high-resolution images and long videos.
We also conduct a systematic investigation to enhance the efficiency of VLMs throughout their entire lifecycle, from training and fine-tuning to deployment.
In this paper, we look at the research behind NVILA and show how it matches or surpasses the accuracy of many leading open and proprietary VLMs across a wide range of image and video benchmarks.
At the same time, it reduces training costs by 4.5×, fine-tuning memory usage by 3.4×, pre-filling latency by 1.6-2.2×, and decoding latency by 1.2-2.8×.
We make our code and models available to facilitate reproducibility.
</p>
</div>
</section>
</div> </section>

<section class="description_noborder">
<!-- Container for the image gallery -->
@@ -963,16 +961,9 @@ <h1>About NVILA</h1>
<!-- -->

<div class="description-content">
<h2>NVILA's core design concept</h2>
<h2>NVILA core design concept</h2>
<p>
In this paper, we introduce <strong>NVILA</strong>, a family of open VLMs designed to optimize both
efficiency and accuracy. Building on VILA, we improve its model architecture by first scaling up the
spatial and temporal resolution, followed by compressing visual tokens. "Scaling" preserves more details
from visual inputs, raising the accuracy upper bound, while "compression" squeezes visual information to
fewer tokens, improving computational efficiency. This "<em>scale-then-compress</em>" strategy allows
NVILA to process high-resolution images and long videos both effectively and efficiently. In addition,
we conduct a systematic study to optimize the efficiency of NVILA throughout its entire lifecycle,
including training, fine-tuning, and deployment.
In this paper, we introduce NVILA, a family of open VLMs designed to optimize both efficiency and accuracy. Building on VILA, we improve the model architecture by first scaling up the spatial and temporal resolution, followed by compressing visual tokens. "Scaling" preserves more details from visual inputs, raising the accuracy upper bound, while "compression" squeezes visual information into fewer tokens, improving computational efficiency. This "scale-then-compress" strategy allows VLMs to process high-resolution images and long videos both effectively and efficiently. In addition, we conduct a systematic study to optimize the efficiency of VLMs throughout their entire lifecycle, including training, fine-tuning, and deployment.
</p>
</div>
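
To make the "scale-then-compress" idea concrete, here is a minimal, hypothetical PyTorch sketch (illustrative only, not code from the NVILA/VILA repositories): it patchifies a higher-resolution image into many visual tokens, then pools neighboring tokens so that far fewer of them reach the language model. The function names (`patch_embed`, `compress_tokens`), patch size, hidden size, and 2×2 pooling ratio are assumptions chosen for this example.

```python
# Conceptual sketch of "scale-then-compress" -- illustrative, not the official NVILA code.
import torch
import torch.nn.functional as F

def patch_embed(image, patch=14, dim=1152):
    """Toy patch embedding: cut the image into patches and linearly project each one."""
    b, c, h, w = image.shape
    patches = F.unfold(image, kernel_size=patch, stride=patch)   # (B, C*patch*patch, N)
    patches = patches.transpose(1, 2)                            # (B, N, C*patch*patch)
    proj = torch.nn.Linear(c * patch * patch, dim)               # stand-in for a real vision encoder
    return proj(patches)                                         # (B, N, dim) visual tokens

def compress_tokens(tokens, grid, ratio=2):
    """Compress tokens by averaging each ratio x ratio spatial neighborhood."""
    b, n, d = tokens.shape
    tokens = tokens.transpose(1, 2).reshape(b, d, grid, grid)    # restore the 2-D token grid
    pooled = F.avg_pool2d(tokens, kernel_size=ratio)             # spatial pooling
    return pooled.flatten(2).transpose(1, 2)                     # (B, N / ratio**2, d)

# 1) Scale: feed a higher-resolution image so patchification keeps more detail.
image = torch.rand(1, 3, 448, 448)                               # e.g. 448x448 instead of 224x224
tokens = patch_embed(image)                                      # (448/14)^2 = 1024 tokens
# 2) Compress: pool the token grid so the LLM processes far fewer visual tokens.
compact = compress_tokens(tokens, grid=448 // 14, ratio=2)
print(tuple(tokens.shape), "->", tuple(compact.shape))           # (1, 1024, 1152) -> (1, 256, 1152)
```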

@@ -1255,4 +1246,4 @@ <h2 class="title">
</script>
</body>

</html>
</html>
